XMLSpider

[email protected] Thu, 16 Aug 2007 18:02:32 +0000 (UTC)

Author: swatig0
Date: 2007-08-16 18:02:31 +0000 (Thu, 16 Aug 2007)
New Revision: 14723


Modified:
   trunk/plugins/XMLSpider/XMLSpider.java
Log:
Outlinks-inlinks for a page

Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java      2007-08-16 15:12:02 UTC (rev 14722)
+++ trunk/plugins/XMLSpider/XMLSpider.java      2007-08-16 18:02:31 UTC (rev 14723)
@@ -81,7 +81,7 @@
  *  @author swati goyal
  *  
  */
-public class XMLSpider implements FredPlugin, FredPluginHTTP, 
FredPluginThreadless,  FredPluginHTTPAdvanced,HttpPlugin, ClientCallback, 
FoundURICallback ,USKCallback{
+public class XMLSpider implements FredPlugin, FredPluginHTTP, 
FredPluginThreadless,  FredPluginHTTPAdvanced,HttpPlugin, ClientCallback, 
USKCallback{

        long tProducedIndex;
        private TreeMap tMap = new TreeMap();
@@ -89,25 +89,33 @@
        // URIs visited, or fetching, or queued. Added once then forgotten about.
        private final HashSet visitedURIs = new HashSet();
        private final HashSet urisWithWords = new HashSet();
+       private final HashSet idsWithWords = new HashSet();
        private final HashSet failedURIs = new HashSet();
        private final HashSet queuedURISet = new HashSet();
        private final LinkedList queuedURIList = new LinkedList();
        private final HashMap runningFetchesByURI = new HashMap();
        private final HashMap urisByWord = new HashMap();
+       private final HashMap idsByWord = new HashMap();
        private final HashMap titlesOfURIs = new HashMap();
+       private final HashMap titlesOfIds = new HashMap();
+       private final HashMap uriIds = new HashMap();
+       private final HashMap idUris = new HashMap();
+       private final HashMap outlinks = new HashMap();
+       private final HashMap inlinks = new HashMap();
        private Vector indices;
        private int match;
+       private int id;
        private Vector list;
        private boolean indexing ;

-       private static final int minTimeBetweenEachIndexRewriting = 50;
+       private static final int minTimeBetweenEachIndexRewriting = 10;
 /**
  * DEFAULT_INDEX_DIR is the directory where the generated indices are stored.
  * Needs to be created before it can be used
  */
-       private static final String DEFAULT_INDEX_DIR = "myindex3/";
+       private static final String DEFAULT_INDEX_DIR = "myindex4/";
        public Set allowedMIMETypes;
-       private static final int MAX_ENTRIES = 30;
+       private static final int MAX_ENTRIES = 10;
        private static final String pluginName = "XML spider";
        /**
         * This gives the allowed fraction of total time spent on generating 
indices
@@ -120,9 +128,10 @@
        private static final String indexOwnerEmail = null;
        private final HashMap sizeOfURIs = new HashMap(); /* String (URI) -> 
Long */
        private final HashMap mimeOfURIs = new HashMap(); /* String (URI) -> 
String */
-       private final HashMap lastPositionByURI = new HashMap(); /* String 
(URI) -> Integer */ /* Use to determine word position on each uri */
-       private final HashMap positionsByWordByURI = new HashMap(); /* String 
(URI) -> HashMap (String (word) -> Integer[] (Positions)) */
-
+//     private final HashMap lastPositionByURI = new HashMap(); /* String 
(URI) -> Integer */ /* Use to determine word position on each uri */
+       private final HashMap lastPositionById = new HashMap();
+//     private final HashMap positionsByWordByURI = new HashMap(); /* String 
(URI) -> HashMap (String (word) -> Integer[] (Positions)) */
+       private final HashMap positionsByWordById = new HashMap();
        // Can have many; this limit only exists to save memory.
        private static final int maxParallelRequests = 100;
        private int maxShownURIs = 15;
@@ -150,6 +159,11 @@
                if ((!visitedURIs.contains(uri)) && queuedURISet.add(uri)) {
                        queuedURIList.addLast(uri);
                        visitedURIs.add(uri);
+                       uriIds.put(uri, id);
+                       idUris.put(id, uri);
+                       id++;
+                       
+                       //the page object of the client will contain the uri of the current page
                }
        }

@@ -180,7 +194,7 @@
                                        break;
                                FreenetURI uri = (FreenetURI) 
queuedURIList.removeFirst();
                                queuedURISet.remove(uri);
-                               if((uri.getKeyType()).equals("USK")){
+//                             if((uri.getKeyType()).equals("USK")){
 //                             if(uri.getSuggestedEdition() < 0)
 //                                     uri = uri.setSuggestedEdition((-1)* 
uri.getSuggestedEdition());
 //                             try{
@@ -189,7 +203,7 @@
 //                                     
 //                             }

-                               }
+       //                      }
                                ClientGetter getter = makeGetter(uri);
                                toStart.add(getter);
                                }
@@ -224,14 +238,7 @@

        public void onSuccess(FetchResult result, ClientGetter state) {
                FreenetURI uri = state.getURI();
-               try{
-           FileWriter output = new FileWriter("logfile",true);
-           output.write(uri.toString()+"\n");
-           output.close();
-               }
-               catch(Exception e){
-                       Logger.error(this, "The uri could not be removed from 
running "+e.toString(), e);
-               }
+
                synchronized (this) {
                        runningFetchesByURI.remove(uri);
                }
@@ -242,9 +249,21 @@

                sizeOfURIs.put(uri.toString(), new Long(data.size()));
                mimeOfURIs.put(uri.toString(), mimeType);
+               PageCallBack page = new PageCallBack();
+               page.id = (Integer) uriIds.get(uri);
+               inlinks.put(page.id, new Vector());
+               outlinks.put(page.id, new Vector());

+               try{
+           FileWriter output = new FileWriter("logfile",true);
+           output.write(uri.toString()+" page " + page.id +"\n");
+           output.close();
+               }
+               catch(Exception e){
+                       Logger.error(this, "The uri could not be removed from 
running "+e.toString(), e);
+               }
                try {
-                       ContentFilter.filter(data, ctx.bucketFactory, mimeType, uri.toURI("http://127.0.0.1:8888/"), this);
+                       ContentFilter.filter(data, ctx.bucketFactory, mimeType, uri.toURI("http://127.0.0.1:8888/"), page);
                } catch (UnsafeContentTypeException e) {
                        return; // Ignore
                } catch (IOException e) {
@@ -255,17 +274,17 @@
                        data.free();
                }
        }
-
+       
        public void onFailure(FetchException e, ClientGetter state) {
                FreenetURI uri = state.getURI();
-               try{
-                       FileWriter outp = new FileWriter("failed",true);
-                       outp.write("failed "+e.toString()+" for "+uri+'\n');
-                       outp.close();
-                       
-               }catch(Exception e2){
-                       
-               }
+//             try{
+//                     FileWriter outp = new FileWriter("failed",true);
+//                     outp.write("failed "+e.toString()+" for "+uri+'\n');
+//                     outp.close();
+//                     
+//             }catch(Exception e2){
+//                     
+//             }
                synchronized (this) {
                        runningFetchesByURI.remove(uri);
                        failedURIs.add(uri);
@@ -291,285 +310,7 @@
                // Ignore
        }

-       public void foundURI(FreenetURI uri) {
-               queueURI(uri);
-               startSomeRequests();
-       }
-
-       public void onText(String s, String type, URI baseURI) {
-               
-               FreenetURI uri;
-               try {
-                       uri = new FreenetURI(baseURI.getPath().substring(1));
-               } catch (MalformedURLException e) {
-                       Logger.error(this, "Caught " + e, e);
-                       return;
-               }
-                
-               
-      
-               if((type != null) && (type.length() != 0) && 
type.toLowerCase().equals("title")
-                  && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 
0)) {
-                       /* We should have a correct title */
-                       titlesOfURIs.put(uri.toString(), s);
-                       type = "title";
-                       
-               }
-               else type = null;
-
-
-               String[] words = s.split("[^A-Za-z0-9]");
-
-               Integer lastPosition = null;
-
-               lastPosition = (Integer)lastPositionByURI.get(uri.toString());
-
-               if(lastPosition == null)
-                       lastPosition = new Integer(1); /* We start to count 
from 1 */
-
-               for (int i = 0; i < words.length; i++) {
-                       String word = words[i];
-                       if ((word == null) || (word.length() == 0))
-                               continue;
-                       word = word.toLowerCase();
-                       try{
-                       if(type == null)
-                               addWord(word, lastPosition.intValue() + i, uri);
-                       else
-                               addWord(word, -1 * (i+1), uri);
-                       }
-                       catch (Exception e){}
-               }
-               
-               if(type == null) {
-                       lastPosition = new Integer(lastPosition.intValue() + 
words.length);
-                       lastPositionByURI.put(uri.toString(), lastPosition);
-               }
-               
-       }
-
-       private synchronized void addWord(String word, int position,FreenetURI 
uri) throws Exception{
-               
-               
-               if(word.length() < 3)
-                       return;
-               
-               //word = word.intern();
-
-
-               FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word);
-
-               //Integer[] positions = (Integer[]) 
positionsByWordByURI.get(word);
-
-               urisWithWords.add(uri);
-//     FileWriter outp = new FileWriter("uricheck",true);
-//     outp.write(uri.getDocName()+"\n");
-//     outp.write(uri.getKeyType()+"\n");
-//     outp.write(uri.getMetaString()+"\n");
-//     outp.write(uri.getGuessableKey()+"\n");
-//     outp.write(uri.hashCode()+"\n");
-//     outp.write(uri.getPreferredFilename()+"\n");
-//     
-//     outp.close();
-
-               /* Word position indexation */
-               HashMap wordPositionsForOneUri = 
(HashMap)positionsByWordByURI.get(uri.toString()); /* For a given URI, take as 
key a word, and gives position */
-               
-               if(wordPositionsForOneUri == null) {
-                       wordPositionsForOneUri = new HashMap();
-                       wordPositionsForOneUri.put(word, new Integer[] { new 
Integer(position) });
-                       positionsByWordByURI.put(uri.toString(), 
wordPositionsForOneUri);
-               } else {
-                       Integer[] positions = 
(Integer[])wordPositionsForOneUri.get(word);
-
-                       if(positions == null) {
-                               positions = new Integer[] { new 
Integer(position) };
-                               wordPositionsForOneUri.put(word, positions);
-                       } else {
-                               Integer[] newPositions = new 
Integer[positions.length + 1];
-
-                               System.arraycopy(positions, 0, newPositions, 0, 
positions.length);
-                               newPositions[positions.length] = new 
Integer(position);
-
-                               wordPositionsForOneUri.put(word, newPositions);
-                       }
-               }
-       
-               if (uris == null) {
-                       urisByWord.put(word, new FreenetURI[] { uri });
-                       
-               } else {
-                       for (int i = 0; i < uris.length; i++) {
-                               if (uris[i].equals(uri))
-                                       return;
-                       }
-                       FreenetURI[] newURIs = new FreenetURI[uris.length + 1];
-                       System.arraycopy(uris, 0, newURIs, 0, uris.length);
-                       newURIs[uris.length] = uri;
-                       urisByWord.put(word, newURIs);
-               }
-               //the new word is added here in urisByWord
-               tMap.put(MD5(word), word);
-               long time_indexing = System.currentTimeMillis();
-               if (tProducedIndex + minTimeBetweenEachIndexRewriting * 1000 < 
System.currentTimeMillis()) {
-                       try {
-                               //produceIndex();
-                               //check();
-                               
-                               if(indexing){
-                               generateIndex2();
-                               produceIndex2();
-                               if((System.currentTimeMillis() - 
time_indexing)/(System.currentTimeMillis() - tProducedIndex) > 
MAX_TIME_SPENT_INDEXING) indexing= false;
-                               else indexing = true;
-                               }
-                               
-                       } catch (IOException e) {
-                               Logger.error(this, "Caught " + e + " while 
creating index", e);
-                       }
-                       tProducedIndex = System.currentTimeMillis();
-               }
-               
-       }
-//     private synchronized void check() throws IOException{
-//             FileWriter outp = new FileWriter("logs/indexing",true);
-//             outp.write("size = "+urisByWord.size()+"\n");
-//             Iterator it = urisByWord.keySet().iterator();
-//             while(it.hasNext())
-//                     outp.write(it.next()+"\n");
-//             outp.close();
-//     }
-
-       private synchronized void produceIndex() throws 
IOException,NoSuchAlgorithmException {
-               // Produce the main index file.
-               
-               //the number of bits to consider for matching 
-               int prefix = 1 ;
-       
-               if (urisByWord.isEmpty() || urisWithWords.isEmpty()) {
-                       System.out.println("No URIs with words");
-                       return;
-               }
-               File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
-               StreamResult resultStream;
-               resultStream = new StreamResult(outputFile);
-
-               /* Initialize xml builder */
-               Document xmlDoc = null;
-               DocumentBuilderFactory xmlFactory = null;
-               DocumentBuilder xmlBuilder = null;
-               DOMImplementation impl = null;
-               Element rootElement = null;
-
-               xmlFactory = DocumentBuilderFactory.newInstance();
-
-
-               try {
-                       xmlBuilder = xmlFactory.newDocumentBuilder();
-               } catch(javax.xml.parsers.ParserConfigurationException e) {
-                       /* Will (should ?) never happen */
-                       Logger.error(this, "Spider: Error while initializing 
XML generator: "+e.toString());
-                       return;
-               }
-
-               impl = xmlBuilder.getDOMImplementation();
-               /* Starting to generate index */
-               xmlDoc = impl.createDocument(null, "main_index", null);
-               rootElement = xmlDoc.getDocumentElement();
-
-               /* Adding header to the index */
-               Element headerElement = xmlDoc.createElement("header");
-
-               /* -> title */
-               Element subHeaderElement = xmlDoc.createElement("title");
-               Text subHeaderText = xmlDoc.createTextNode(indexTitle);
-               
-               subHeaderElement.appendChild(subHeaderText);
-               headerElement.appendChild(subHeaderElement);
-
-               /* -> owner */
-               subHeaderElement = xmlDoc.createElement("owner");
-               subHeaderText = xmlDoc.createTextNode(indexOwner);
-               
-               subHeaderElement.appendChild(subHeaderText);
-               headerElement.appendChild(subHeaderElement);
-               
-               /* -> owner email */
-               if(indexOwnerEmail != null) {
-                       subHeaderElement = xmlDoc.createElement("email");
-                       subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
-                       
-                       subHeaderElement.appendChild(subHeaderText);
-                       headerElement.appendChild(subHeaderElement);
-               }
-
-               
-               //String[] words = (String[]) urisByWord.keySet().toArray(new 
String[urisByWord.size()]);
-               //Arrays.sort(words);
-               FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new 
FreenetURI[urisWithWords.size()]);
-               urisToNumbers = new HashMap();
-               Element prefixElement = xmlDoc.createElement("prefix");
-               prefixElement.setAttribute("value", prefix+"");
-       
-
-               for (int i = 0; i < uris.length; i++) {
-                       urisToNumbers.put(uris[i], new Integer(i));
-                       }
-               
-               //all index files are ready
-               /* Adding word index */
-               Element keywordsElement = xmlDoc.createElement("keywords");
-               for(int i = 0;i<16;i++){
-                       
generateSubIndex(DEFAULT_INDEX_DIR+"index_"+Integer.toHexString(i)+".xml");
-                       Element subIndexElement = 
xmlDoc.createElement("subIndex");
-                       if(i<=9)
-                       subIndexElement.setAttribute("key",i+"");
-                       else
-                               
subIndexElement.setAttribute("key",Integer.toHexString(i));
-                       //the subindex element key will contain the bits used 
for matching in that subindex
-                       keywordsElement.appendChild(subIndexElement);
-               }
-               
-
-               // make sure that prefix is the first child of root Element
-               rootElement.appendChild(prefixElement);
-               rootElement.appendChild(headerElement);
-               
-               //rootElement.appendChild(filesElement);
-               rootElement.appendChild(keywordsElement);
-
-               /* Serialization */
-               DOMSource domSource = new DOMSource(xmlDoc);
-               TransformerFactory transformFactory = 
TransformerFactory.newInstance();
-               Transformer serializer;
-
-               try {
-                       serializer = transformFactory.newTransformer();
-               } catch(javax.xml.transform.TransformerConfigurationException 
e) {
-                       Logger.error(this, "Spider: Error while serializing XML 
(transformFactory.newTransformer()): "+e.toString());
-                       return;
-               }
-
-               serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
-               serializer.setOutputProperty(OutputKeys.INDENT,"yes");
-               
-               /* final step */
-               try {
-                       serializer.transform(domSource, resultStream);
-               } catch(javax.xml.transform.TransformerException e) {
-                       Logger.error(this, "Spider: Error while serializing XML 
(transform()): "+e.toString());
-                       return;
-               }
-
-               if(Logger.shouldLog(Logger.MINOR, this))
-                       Logger.minor(this, "Spider: indexes regenerated.");
-       
-       //the main xml file is generated 
-       //now as each word is generated enter it into the respective subindex
-       //now the parsing will start and nodes will be added as needed 
-               
-
-       }
-/**
+       /**
  * generates the main index file that can be used by librarian for searching 
in the list of
  * subindices
  *  
@@ -584,7 +325,12 @@
                //the number of bits to consider for matching 


-               if (urisByWord.isEmpty() || urisWithWords.isEmpty()) {
+//             if (urisByWord.isEmpty() || urisWithWords.isEmpty()) {
+//                     System.out.println("No URIs with words");
+//                     return;
+//             }
+               
+               if (idsByWord.isEmpty() || idsWithWords.isEmpty()) {
                        System.out.println("No URIs with words");
                        return;
                }
@@ -715,22 +461,26 @@
         */
        private synchronized void generateIndex2() throws Exception{
                // now we the tree map and we need to use the sorted (md5s) to 
generate the xml indices
-               if (urisByWord.isEmpty() || urisWithWords.isEmpty()) {
+       
+               
+               if (idsByWord.isEmpty() || idsWithWords.isEmpty()) {
                        System.out.println("No URIs with words");
                        return;
                }
-               FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new 
FreenetURI[urisWithWords.size()]);
-               urisToNumbers = new HashMap();
-               for (int i = 0; i < uris.length; i++) {
-                       urisToNumbers.put(uris[i], new Integer(i));
-                       }
+               
+       //      FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new 
FreenetURI[urisWithWords.size()]);
+               Integer[] ids = (Integer[]) idsWithWords.toArray(new 
Integer[idsWithWords.size()]);
+//             urisToNumbers = new HashMap();
+//             for (int i = 0; i < uris.length; i++) {
+//                     urisToNumbers.put(uris[i], new Integer(i));
+//                     }
                indices = new Vector();
                int prefix = 1;
                match = 1;
                Vector list = new Vector();
                //String str = tMap.firstKey();
                Iterator it = tMap.keySet().iterator();
-               FileWriter outp = new FileWriter("indexing");
+                FileWriter outp = new FileWriter("indexing");
                outp.write("size = "+tMap.size()+"\n");
                outp.close();
                String str = (String) it.next();
@@ -797,9 +547,14 @@
        }       


-       private synchronized void generateXML(Vector list, int prefix)
+       private synchronized void generateXML (Vector list, int prefix) throws 
Exception
        {
+               FileWriter outp = new FileWriter("gen",true);
+               
+               
                String p = ((String) list.elementAt(0)).substring(0, prefix);
+               outp.write("inside gen xml + "+p+"\n");
+               
                indices.add(p);
                File outputFile = new File(DEFAULT_INDEX_DIR+"index_"+p+".xml");
                //indices.add(p);
@@ -847,7 +602,9 @@
                Element filesElement = xmlDoc.createElement("files"); /* 
filesElement != fileElement */

                Element EntriesElement = xmlDoc.createElement("entries");
+               
                EntriesElement.setNodeValue(list.size()+"");
+               outp.write("size = "+list.size()+"\n");
                EntriesElement.setAttribute("value", list.size()+"");
                //all index files are ready
                /* Adding word index */
@@ -858,13 +615,16 @@
                {
                        Element wordElement = xmlDoc.createElement("word");
                        String str = (String) tMap.get(list.elementAt(i));
+                       outp.write("word "+str+"\n");
                        wordElement.setAttribute("v",str );
-                       FreenetURI[] urisForWord = (FreenetURI[]) 
urisByWord.get(str);
+                       //FreenetURI[] urisForWord = (FreenetURI[]) 
urisByWord.get(str);
+                       Integer[] idsForWord = (Integer[]) idsByWord.get(str);
 //                     
-                       for (int j = 0; j < urisForWord.length; j++) {
-                               FreenetURI uri = urisForWord[j];
-                               Integer x = (Integer) urisToNumbers.get(uri);
-                               
+                       for (int j = 0; j < idsForWord.length; j++) {
+                               Integer id = idsForWord[j];
+                               //Integer x = (Integer) urisToNumbers.get(uri);
+                               Integer x = id;
+                               outp.write("x "+x+"\n");
                                if (x == null) {
                                        Logger.error(this, "Eh?");
                                        continue;
@@ -874,9 +634,12 @@
                                Element fileElement = 
xmlDoc.createElement("file");
                                uriElement.setAttribute("id", x.toString());
                                fileElement.setAttribute("id", x.toString());
-                               fileElement.setAttribute("key", uri.toString());
+                               //fileElement.setAttribute("key", 
uri.toString());
+                               outp.write("uri 
"+(idUris.get(id)).toString()+"\n");
+                               
fileElement.setAttribute("key",(idUris.get(id)).toString());
 ////                           /* Position by position */
-                               HashMap positionsForGivenWord = 
(HashMap)positionsByWordByURI.get(uri.toString());
+                               //HashMap positionsForGivenWord = 
(HashMap)positionsByWordByURI.get(uri.toString());
+                               HashMap positionsForGivenWord = 
(HashMap)positionsByWordById.get(x);
                                Integer[] positions = 
(Integer[])positionsForGivenWord.get(str);

                                StringBuffer positionList = new StringBuffer();
@@ -889,11 +652,11 @@
                                }

                                
uriElement.appendChild(xmlDoc.createTextNode(positionList.toString()));
-                               int l;
+                       
                                wordElement.appendChild(uriElement);
-                               if(!fileid.contains(x.toString()))
+                               if(!fileid.contains(x))
                                {
-                                       fileid.add(x.toString());
+                                       fileid.add(x);
                                        filesElement.appendChild(fileElement);
                                }
                        }
@@ -934,214 +697,10 @@

                if(Logger.shouldLog(Logger.MINOR, this))
                        Logger.minor(this, "Spider: indexes regenerated.");
-       
+               outp.close();
        }
-       private synchronized void generateIndex() throws Exception{
-               String[] words = (String[]) urisByWord.keySet().toArray(new String[urisByWord.size()]);
-               Arrays.sort(words);
-               for (int i = 0; i < words.length; i++) {
-               try{
-               
-               String prefix_match = getIndex(words[i]);

-               boolean addedWord = addWord(prefix_match,words[i]);
-
-               if(addedWord == false)
-                       {
-                       split(prefix_match);
-                       regenerateIndex(prefix_match);
-                       prefix_match = getIndex(words[i]);
-                       addWord(prefix_match,words[i]);
-                       }
-               }
-               catch(Exception e2){Logger.error(this,"The Word could not be added"+ e2.toString(), e2); }
-               }       
-
-       
-       }
-       private void regenerateIndex(String prefix) throws Exception{
-               //redistribute the entries in prefix.xml to prefix(0-f).xml
-               DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
-               DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
-               Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index_"+prefix+".xml");
-               Element root = doc.getDocumentElement();
-               NodeList wordList = root.getElementsByTagName("word");
-               for(int i = 0;i<wordList.getLength();i++){
-                       Element word = (Element)wordList.item(i);
-                       String value = word.getAttribute("v");
-                       String prefix_match = getIndex(value);
-                       addWord(prefix_match,value);
-               }
-       }
-       
-       private String getIndex(String word) throws Exception {
-               DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
-               DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
-               Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index.xml");
-               Element root = doc.getDocumentElement();
-               Attr prefix_value = (Attr) (root.getElementsByTagName("prefix").item(0)).getAttributes().getNamedItem("value");
-               int prefix = Integer.parseInt(prefix_value.getValue()); 
-               String md5 = MD5(word);
-               NodeList subindexList = root.getElementsByTagName("subIndex");
-               String str = md5.substring(0,prefix);           
-               String prefix_match = search(str,subindexList);

-               return prefix_match;
-       }
-       
-       private boolean addWord(String prefix, String str) throws Exception
-       {
-               //this word has to be added to the particular subindex
-               // modify the corresponding index
-               try{
-                       DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
-                       DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
-                       Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index_"+prefix+".xml");
-                       Element root = doc.getDocumentElement();
-                       
-                       Element entry = (Element) root.getElementsByTagName("entries").item(0);
-                       
-                       Attr no_entries = (Attr) entry.getAttributes().getNamedItem("value");
-                       
-                       Element filesElement = (Element) root.getElementsByTagName("files").item(0);
-                       NodeList filesList = filesElement.getElementsByTagName("file");
-                       if(Integer.parseInt(no_entries.getValue()) >= MAX_ENTRIES) return false;
-                       else
-                       {
-                       //increment the number of entries
-                       entry.setAttribute("value",(Integer.parseInt(no_entries.getValue())+1)+"");
-                       //add the entry
-                       
-                       Element wordElement = doc.createElement("word");
-                       wordElement.setAttribute("v", str);
-
-                       FreenetURI[] urisForWord = (FreenetURI[]) urisByWord.get(str);
-
-                       /* URI by URI */
-                       for (int j = 0; j < urisForWord.length; j++) {
-                               FreenetURI uri = urisForWord[j];
-                               Integer x = (Integer) urisToNumbers.get(uri);
-                               
-                               if (x == null) {
-                                       Logger.error(this, "Eh?");
-                                       continue;
-                               }
-
-                               Element uriElement = doc.createElement("file");
-                               Element fileElement = doc.createElement("file");
-                               uriElement.setAttribute("id", x.toString());
-                               fileElement.setAttribute("id", x.toString());
-                               fileElement.setAttribute("key", uri.toString());
-//                             /* Position by position */
-                               HashMap positionsForGivenWord = (HashMap)positionsByWordByURI.get(uri.toString());
-                               Integer[] positions = (Integer[])positionsForGivenWord.get(str);
-
-                               StringBuffer positionList = new StringBuffer();
-
-                               for(int k=0; k < positions.length ; k++) {
-                                       if(k!=0)
-                                               positionList.append(',');
-
-                                       positionList.append(positions[k].toString());
-                               }
-                               
-                               uriElement.appendChild(doc.createTextNode(positionList.toString()));
-                               int l;
-                       for(l = 0;l<filesList.getLength();l++)
-                               { Element file = (Element) filesList.item(l);
-                               if(file.getAttribute("id").equals(x.toString()))
-                               
-                               break;
-                               }
-                               wordElement.appendChild(uriElement);
-                               if(l>=filesList.getLength())
-                               filesElement.appendChild(fileElement);
-                       }
-                       Element keywordsElement = (Element) root.getElementsByTagName("keywords").item(0);
-                       keywordsElement.appendChild(wordElement);
-               
-                       
-                       
-                       DOMSource domSource = new DOMSource(doc);
-                       TransformerFactory transformFactory = TransformerFactory.newInstance();
-                       Transformer serializer;
-
-                       
-                               serializer = transformFactory.newTransformer();
-                       
-                               
-                                               
-                       File outputFile = new File(DEFAULT_INDEX_DIR+"index_"+prefix+".xml");
-                       StreamResult resultStream;
-                       resultStream = new StreamResult(outputFile);
-
-                       serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
-                       serializer.setOutputProperty(OutputKeys.INDENT,"yes");
-                       
-                       /* final step */
-                       try {
-                               serializer.transform(domSource, resultStream);
-                       } catch(javax.xml.transform.TransformerException e) {}
-                       }
-                       
-                       return true;    
-               }
-               
-               catch(Exception e){Logger.error(this,"Word could not be added to the subindex"+ e.toString(), e);}
-               return false;
-       }
-       private void split(String prefix) throws Exception
-       {
-               //first we need to split the current subindex into 16 newones
-               //then read from the original one and append to the new ones
-               // make the entry in the main index..
-               DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
-               DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
-               Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index.xml");
-               Element root = doc.getDocumentElement();
-               Element prefixElt =(Element) root.getElementsByTagName("prefix").item(0);
-               int prefix_current = Integer.parseInt(prefixElt.getAttribute("value"));
-               if (prefix_current <= prefix.length())
-               prefixElt.setAttribute("value", (prefix_current+1)+"");
-               
-               Element keywordElement = (Element) root.getElementsByTagName("keywords").item(0);
-               
-               NodeList subIndexElt = root.getElementsByTagName("subIndex");
-               for(int i =0;i<subIndexElt.getLength();i++)
-               {
-                       Element subIndex = (Element) subIndexElt.item(i);
-                       if((subIndex.getAttribute("key")).equals(prefix)) {
-                               keywordElement.removeChild(subIndex);
-                               break;
-                       }
-               }
-               
-               for(int i = 0;i<16;i++)
-                       {
-                       Element subIndex = doc.createElement("subIndex");
-                       generateSubIndex(DEFAULT_INDEX_DIR+"index_"+prefix+Integer.toHexString(i)+".xml");
-                       subIndex.setAttribute("key",prefix.concat(Integer.toHexString(i)));
-                       keywordElement.appendChild(subIndex);
-                       }
-               
-               
-               DOMSource domSource = new DOMSource(doc);
-               TransformerFactory transformFactory = TransformerFactory.newInstance();
-               Transformer serializer;
-               serializer = transformFactory.newTransformer();
-               File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
-               StreamResult resultStream;
-               resultStream = new StreamResult(outputFile);
-
-               serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
-               serializer.setOutputProperty(OutputKeys.INDENT,"yes");
-               
-               /* final step */
-               try {
-                       serializer.transform(domSource, resultStream);
-               } catch(javax.xml.transform.TransformerException e) {}
-       }
-       
        public String search(String str,NodeList list) throws Exception
        {
                int prefix = str.length();
@@ -1319,17 +878,6 @@
         */
        public void startPlugin() {
                stopped = false;
-               
-//             Thread starterThread = new Thread("Spider Plugin Starter") {
-//                     public void run() {
-//                             try{
-//                                     Thread.sleep(30 * 1000); // Let the node start up
-//                             } catch (InterruptedException e){}
-//                             startSomeRequests();
-//                     }
-//             };
-//             starterThread.setDaemon(true);
-//             starterThread.start();
        }

        /**
@@ -1485,6 +1033,7 @@

 public void runPlugin(PluginRespirator pr){
        this.pr = pr;
+       this.id = 0;
        this.core = pr.getNode().clientCore;
        this.ctx = core.makeClient((short) 0).getFetchContext();
        ctx.maxSplitfileBlockRetries = 10;
@@ -1546,54 +1095,6 @@
                        out.append("<p>MalFormed URI: "+uriParam+"</p");
                }
                }
-       
-//     if(action == null || action.length() == 0){
-//             //put the default post fields
-//             appendDefaultPageStart(out,null);
-//             
-//     } else if ("list".equals(action)) {
-//             String listName = request.getParam("listName", null);
-//             out.append("<p>list clicked</CENTER></BODY></HTML>");
-//             if(listName == null){
-//                     //display all th
-//                     Set runningFetches = new HashMap(runningFetchesByURI).keySet();
-//                     List queued = new ArrayList(queuedURIList);
-//                     Set visited = new HashSet(visitedURIs);
-//                     Set failed = new HashSet(failedURIs);
-//                     
-//                     out.append("<p><h3>Running Fetches</h3>");
-//                     Iterator it=runningFetches.iterator();
-//                     while(it.hasNext()){
-//                             out.append("<code>"+(it.next()).toString()+"</code><br>");
-//                     }
-//             }
-//             else{
-//                     //display individual results
-//             }
-//     }
-//     else if ("add".equals(action)) {
-//             String uriParam = request.getParam("key");
-//             try {
-//                     FreenetURI uri = new FreenetURI(uriParam);
-//                     synchronized (this) {
-//                             failedURIs.remove(uri);
-//                             visitedURIs.remove(uri);
-//                     }
-//                     queueURI(uri);
-//                     startSomeRequests();
-//             } catch (MalformedURLException mue1) {
-//                     out.append("<h1>URL invalid</h1>");
-////                   sendSimpleResponse(context, "URL invalid", "The given URI is not valid.");
-////                   return;
-//             }
-//             //not really necc
-////           MultiValueTable responseHeaders = new MultiValueTable();
-////           responseHeaders.put("Location", "?action=list");
-////           context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0);
-//             
-//     
-//     }
-       
        return out.toString();
 }
 private void appendList(String listname, StringBuffer out, String stylesheet)
@@ -1679,6 +1180,191 @@
        }

 }
+
+public class PageCallBack implements FoundURICallback{
+       int id;
+               
+       PageCallBack(){
+               id = 0;
+       }
+       public void foundURI(FreenetURI uri){
+               //now we have the id of the page that had called this link
+               queueURI(uri);
+               int iduri = (Integer) uriIds.get(uri);
+               Vector outlink = (Vector) outlinks.get(id);
+               if(!outlink.contains(iduri))    
+                       outlink.add(iduri);
+               outlinks.remove(id);
+               outlinks.put(id, outlink);
+               try{
+               FileWriter out = new FileWriter("outlink",true);
+               out.write(" id "+id+" size "+ outlink.size()+" \n");
+               out.close();
+               }catch(Exception e){}
+
+               if(inlinks.containsKey(iduri)){
+                       Vector inlink = (Vector) inlinks.get(iduri);
+                       try{
+                               FileWriter out = new FileWriter("inlink",true);
+                               out.write(" id "+iduri+" size "+ inlink.size()+" \n");
+                               out.close();
+                               }catch(Exception e){}
+               
+                       if(!inlink.contains(id)) inlink.add(id);
+                       inlinks.remove(iduri);
+                       inlinks.put(iduri, inlink);
+                       
+               }
+               startSomeRequests();
+       }
+       public void onText(String s, String type, URI baseURI){
+               try{
+                       FileWriter outp = new FileWriter("ontext",true);
+                       outp.write("inside on text with id"+id+" \n");
+                       outp.close();
+               }catch(Exception e){}
+//             FreenetURI uri;
+//             try {
+//                     uri = new FreenetURI(baseURI.getPath().substring(1));
+//             } catch (MalformedURLException e) {
+//                     Logger.error(this, "Caught " + e, e);
+//                     return;
+//             }
+                
+               
+      
+               if((type != null) && (type.length() != 0) && type.toLowerCase().equals("title")
+                  && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 0)) {
+                       /* We should have a correct title */
+               //      titlesOfURIs.put(uri.toString(), s);
+                       titlesOfIds.put(id, s);
+                       
+                       type = "title";
+                       
+               }
+               else type = null;
+
+
+               String[] words = s.split("[^A-Za-z0-9]");
+
+               Integer lastPosition = null;
+
+               //lastPosition = (Integer)lastPositionByURI.get(uri.toString());
+               lastPosition = (Integer)lastPositionById.get(id);
+               if(lastPosition == null)
+                       lastPosition = new Integer(1); /* We start to count from 1 */
+
+               for (int i = 0; i < words.length; i++) {
+                       String word = words[i];
+                       if ((word == null) || (word.length() == 0))
+                               continue;
+                       word = word.toLowerCase();
+                       try{
+                       if(type == null)
+                               addWord(word, lastPosition.intValue() + i, id);
+                       else
+                               addWord(word, -1 * (i+1), id);
+                       }
+                       catch (Exception e){}
+               }
+               
+               if(type == null) {
+                       lastPosition = new Integer(lastPosition.intValue() + words.length);
+               //      lastPositionByURI.put(uri.toString(), lastPosition);
+                       lastPositionById.put(id, lastPosition);
+               }
+               
+       }
+private synchronized void addWord(String word, int position,int id) throws Exception{
+               
+               
+               if(word.length() < 3)
+                       return;
+               
+               //word = word.intern();
+
+
+               //FreenetURI[] uris = (FreenetURI[]) urisByWord.get(word);
+               Integer[] ids = (Integer[]) idsByWord.get(word);
+               
+       //      urisWithWords.add(uri);
+               idsWithWords.add(id);
+               try{
+                       FileWriter outp = new FileWriter("addWord",true);
+                       outp.write("ID ="+id+" uri ="+idUris.get(id)+"\n");
+                       outp.close();
+               }catch(Exception e){}
+//     FileWriter outp = new FileWriter("uricheck",true);
+//     outp.write(uri.getDocName()+"\n");
+//     outp.write(uri.getKeyType()+"\n");
+//     outp.write(uri.getMetaString()+"\n");
+//     outp.write(uri.getGuessableKey()+"\n");
+//     outp.write(uri.hashCode()+"\n");
+//     outp.write(uri.getPreferredFilename()+"\n");
+//     
+//     outp.close();
+
+               /* Word position indexation */
+               HashMap wordPositionsForOneUri = (HashMap)positionsByWordById.get(id); /* For a given URI, take as key a word, and gives position */
+               
+               if(wordPositionsForOneUri == null) {
+                       wordPositionsForOneUri = new HashMap();
+                       wordPositionsForOneUri.put(word, new Integer[] { new Integer(position) });
+                       //positionsByWordByURI.put(uri.toString(), wordPositionsForOneUri);
+                       positionsByWordById.put(id, wordPositionsForOneUri);
+               } else {
+                       Integer[] positions = (Integer[])wordPositionsForOneUri.get(word);
+
+                       if(positions == null) {
+                               positions = new Integer[] { new Integer(position) };
+                               wordPositionsForOneUri.put(word, positions);
+                       } else {
+                               Integer[] newPositions = new Integer[positions.length + 1];
+
+                               System.arraycopy(positions, 0, newPositions, 0, positions.length);
+                               newPositions[positions.length] = new Integer(position);
+
+                               wordPositionsForOneUri.put(word, newPositions);
+                       }
+               }
+       
+               if (ids == null) {
+                       idsByWord.put(word, new Integer[] { id });
+                       
+               } else {
+                       for (int i = 0; i < ids.length; i++) {
+                               if (ids[i].equals(id))
+                                       return;
+                       }
+                       Integer[] newIDs = new Integer[ids.length + 1];
+                       System.arraycopy(ids, 0, newIDs, 0, ids.length);
+                       newIDs[ids.length] = id;
+                       idsByWord.put(word, newIDs);
+               }
+               //the new word is added here in urisByWord
+               tMap.put(MD5(word), word);
+               long time_indexing = System.currentTimeMillis();
+               if (tProducedIndex + minTimeBetweenEachIndexRewriting * 10 < System.currentTimeMillis()) {
+                       try {
+                               //produceIndex();
+                               //check();
+                               
+                               if(indexing){
+                               generateIndex2();
+                               produceIndex2();
+                               if((System.currentTimeMillis() - time_indexing)/(System.currentTimeMillis() - tProducedIndex) > MAX_TIME_SPENT_INDEXING) indexing= false;
+                               else indexing = true;
+                               }
+                               
+                       } catch (IOException e) {
+                               Logger.error(this, "Caught " + e + " while creating index", e);
+                       }
+                       tProducedIndex = System.currentTimeMillis();
+               }
+               
+       }
+       
+}
 public String handleHTTPPut(HTTPRequest request) throws PluginHTTPException{
        return null;
 }

[freenet-cvs] r14723 - trunk/plugins/XMLSpider

Reply via email to