Author: cyberdo
Date: 2006-03-28 20:45:20 +0000 (Tue, 28 Mar 2006)
New Revision: 8345

Added:
   trunk/apps/LibrarianSpider/
   trunk/apps/LibrarianSpider/LibrarianCrawler.java
   trunk/apps/LibrarianSpider/README
Log:
The (really basic) spider...


Added: trunk/apps/LibrarianSpider/LibrarianCrawler.java
===================================================================
--- trunk/apps/LibrarianSpider/LibrarianCrawler.java    2006-03-28 19:42:39 UTC 
(rev 8344)
+++ trunk/apps/LibrarianSpider/LibrarianCrawler.java    2006-03-28 20:45:20 UTC 
(rev 8345)
@@ -0,0 +1,285 @@
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.InputStreamReader;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.Vector;
+
+import freenet.support.URLDecoder;
+import freenet.support.URLEncodedFormatException;
+
+
+public class LibrarianCrawler {
+       
+       private static String getLapTime(long starttime) {
+               long time = System.currentTimeMillis() - starttime;
+               time = time / 1000;
+               String ret = (time/3600) + ":";
+               time = time % 3600;
+               ret += (((time/60) < 10)?"0":"") + (time/60) + ":";
+               time = time % 60;
+               ret += ((time < 10)?"0":"") + time + ":";
+               return ret;
+       }
+       
+       public static void main(String[] args) throws Exception {
+               //URI u = new URI("http://localhost:8888/";);
+               long starttime = System.currentTimeMillis();
+               HashMap whm = new HashMap();
+               Vector uriOutList = new Vector();
+               LinkedList uriInList = new LinkedList();
+               int uriid = 0;
+               String urlsstring;
+               String muststartwith = "http://127.0.0.1:8888/";;
+               String muststartwith2 = "http://localhost:8888/";;
+               //uriq.add(new URIWrapper("http://localhost:8888/SSK at 
LvX5TuKWlL3dbbpwn-2NvQDa5s9YHia~EeHItgBCugA,24Av4ZiTyEYvGKFgH0I~mYEdRoNb9tHQbZO2-89H0~c,AQABAAE/gallery-001/"));
+               uriInList.add(new 
URIWrapper("http://wiki.freenetproject.org/FreenetTestPages";));
+               uriInList.add(new URIWrapper("http://127.0.0.1:8888/SSK at 
60I8H8HinpgZSOuTSD66AVlIFAy-xsppFr0YCzCar7c,NzdivUGCGOdlgngOGRbbKDNfSCnjI0FXjHLzJM4xkJ4,AQABAAE/index-3/"));
+               uriInList.add(new URIWrapper("http://localhost:8888/SSK at 
60I8H8HinpgZSOuTSD66AVlIFAy-xsppFr0YCzCar7c,NzdivUGCGOdlgngOGRbbKDNfSCnjI0FXjHLzJM4xkJ4,AQABAAE/index-4/"));
+               /*uriq.add(new URIWrapper(""));
+               uriq.add(new URIWrapper(""));
+               uriq.add(new URIWrapper(""));
+               */
+               BufferedWriter bw = new BufferedWriter(new FileWriter(new 
File("out.txt")));
+               while (uriInList.peek() != null) {
+                       try {
+                               if (uriInList.size() < 5)
+                                       Thread.sleep(800);
+                       } catch (Exception ess) { ;}
+                       System.err.println(getLapTime(starttime));
+                       System.err.println("uriInList-length: " + 
uriInList.size());
+                       System.err.println("uriOutList-length: " + 
uriOutList.size());
+                       
+                       urlsstring = "";
+                       URIWrapper currenturi = (URIWrapper)uriInList.poll();
+                       URI u = new URI(currenturi.uri);
+                       BufferedReader in;
+                       try {
+                               System.err.println("(" + currenturi.tries + ") 
" + u.toString());
+                               URL url = new URL(u.toString());
+                               URLConnection uc = url.openConnection();
+                               System.err.println(uc.getContentType() );
+                               System.err.println(uc.getContentLength() );
+                               //System.err.println(uc.getHeaderField() );
+                               if (!(uc.getContentType().startsWith("text/")))
+                                       continue;
+                               
+                               in = new BufferedReader(
+                                               new 
InputStreamReader(uc.getInputStream()));
+                       } catch (Exception e) {
+                               System.err.println("Exception: " + e);
+                               if (e instanceof NullPointerException)
+                                       e.printStackTrace();
+                               //e.printStackTrace();
+                               currenturi.tries++;
+                               if (currenturi.tries < 30)
+                                       uriInList.add(currenturi);
+                               try {
+                                       if (uriInList.size() < 5)
+                                               Thread.sleep(10000);
+                               } catch (Exception ess) { ;}
+                               continue;
+                       }
+                       
+                       
+                       String indata = "";
+                       String line;
+                       while ((line = in.readLine()) != null)
+                               indata += " " + line.trim();
+                       
+                       //System.out.println(indata);
+                       
+                       
//System.out.println("--------------------------------------");
+                       
+                       int hrefpos = 0;
+                       String href = "";
+                       while ((hrefpos = indata.indexOf("href=\"", hrefpos + 
1)) > 0) {
+                               href = indata.substring(hrefpos + 6, 
indata.indexOf("\"", hrefpos+7));
+                               urlsstring += u.resolve(href).getPath();
+                               //if (!href.endsWith("htm") && 
!href.endsWith("html") && !href.endsWith("/") && !href.endsWith("txt")) {
+                               String h = href.toLowerCase();
+                               if (h.endsWith("pg") || h.endsWith("peg") || 
h.endsWith("gif") || h.endsWith("mp3") || h.endsWith("avi") || 
h.endsWith("css")) {
+                                                       
;//System.err.println("not adding: " + href);
+                               } else  if 
((u.resolve(href).toString().startsWith(muststartwith)) || 
(u.resolve(href).toString().startsWith(muststartwith2))){
+                                       URIWrapper uw = new 
URIWrapper(u.resolve(href).toString());
+                                       if (!uw.equals(currenturi))
+                                               if (!uriInList.contains(uw))
+                                                       if 
(!uriOutList.contains(uw)) {
+                                                               
uriInList.add(new URIWrapper(u.resolve(href).toString()));
+                                                               
//System.err.print(".");
+                                                       }       
+                               } else
+                                       //System.err.println("External link: "+ 
u.resolve(href));
+                                       ;
+                       }
+                       //currenturi.descr = 
indata.replaceAll(".*<[tT][iI][tT][lL][eE]>.*</[tT][iI][tT][lL][eE]>", "\\1");
+                       currenturi.descr = 
indata.replaceAll(".*<[tT][iI][tT][lL][eE]>", 
"").replaceAll("</[tT][iI][tT][lL][eE]>.*", "");
+                       if 
((u.resolve(href).toString().startsWith(muststartwith)) || 
(u.resolve(href).toString().startsWith(muststartwith2))){
+                               indata = indata.toLowerCase();
+                               indata = indata.replaceAll(".*<body[^>]*>", "");
+                               indata = indata.replaceAll("</body[^>]*", "");
+                               indata = indata.replaceAll("<href=\".[^\"]*\"", 
"><");
+                               indata = indata.replaceAll("<[^>]*>", "");
+                               //System.out.println(indata);
+                               
System.out.println("--------------------------------------");
+                               
+                               String outdata = "";
+                               char ch[] = new char[1];
+                               boolean whitespace = true;
+                               for (int i = 0 ; i < indata.length() ; i++) {
+                                       ch[0] = indata.charAt(i);
+                                       if 
(Character.isJavaIdentifierPart(ch[0])) {
+                                               outdata += new String(ch);
+                                               whitespace = false;
+                                       } else {
+                                               if (!whitespace) {
+                                                       outdata += " ";
+                                                       whitespace = true;
+                                               }
+                                       }
+                               }
+                               System.out.println(outdata);
+                               
+                               String words[] = outdata.split(" ");
+                               for (int i = 0 ; i < words.length ; i++) {
+                                       if (!whm.containsKey(words[i]))
+                                               whm.put(words[i], new 
HashMap());
+                                       HashMap khm = 
(HashMap)whm.get(words[i]);
+                                       if (!khm.containsKey(new 
Integer(uriid)))
+                                               khm.put(new Integer(uriid), new 
Vector());
+                                       Vector wv = (Vector)khm.get(new 
Integer(uriid));
+                                       wv.add(new Integer(i));
+                               }
+                               
+                               words = 
urlsstring.toLowerCase().split("[^[a-zA-Z1-90]]");
+                               for (int i = 0 ; i < words.length ; i++) {
+                                       if (!whm.containsKey(words[i]))
+                                               whm.put(words[i], new 
HashMap());
+                                       HashMap khm = 
(HashMap)whm.get(words[i]);
+                                       if (!khm.containsKey(new 
Integer(uriid)))
+                                               khm.put(new Integer(uriid), new 
Vector());
+                                       Vector wv = (Vector)khm.get(new 
Integer(uriid));
+                                       wv.add(new Integer(-1));
+                               }
+                               
+                               uriOutList.add(currenturi);
+                               uriid++;
+                       }
+               }
+               
+               
+               
+               
System.err.println("===============================================");
+               
System.err.println("===============================================");
+               
System.err.println("===============================================");
+               
System.err.println("===============================================");
+               for (int i = 0 ; i < uriOutList.size() ; i++)
+                       bw.write(((URIWrapper)uriOutList.get(i)).toEntry() + 
"\n");
+               
+               
+               Iterator itw = whm.keySet().iterator();
+               while (itw.hasNext()) {
+                       String word = (String)itw.next();
+                       System.err.print("?"+word);
+                       bw.write("?"+word );
+                       HashMap khm = (HashMap)whm.get(word);
+                       Iterator itk = khm.keySet().iterator();
+                       while (itk.hasNext()) {
+                               Integer keyid = (Integer)itk.next();
+                               bw.write(" " + keyid);
+                               //System.err.print(" " + keyid);
+                               //Vector wv = (Vector)khm.get(keyid);
+                               //for (int i = 0 ; i < wv.size() ; i++)
+                               //System.err.print(((i == 0)?"=":",") + 
wv.get(i));
+                       }
+                       System.err.println();
+                       bw.write("\n");
+               }
+               
+               bw.close();
+               
+               
+               /*
+                u.resolve("/aaa");
+                System.err.println(u);
+                System.err.println(u.resolve("/aaa"));
+                */
+               /*
+                BucketFactory bf = new ArrayBucketFactory();
+                SaferFilter sf = new SaferFilter("", bf);
+                Bucket bucket = new bf.makeBucket(uc.getContentLength());
+                */
+               //sf.run(bucket, "ss")
+               
+               
+               // freenet.client.http.filter.SaferFilter
+       }
+       
+       private static class URIWrapper implements Comparable {
+               public String uri;
+               public int tries = 0;
+               public String descr = null;
+               
+               public URIWrapper(String uri) {
+                       this.uri = uri;
+               }
+               
+               public int compareTo(Object o) {
+                       if (!(o instanceof URIWrapper))
+                               return -1;
+                       
+                       return 
((URIWrapper)o).mkShortURI().compareTo(mkShortURI());
+               }
+               
+               public boolean equals(Object o) {
+                       if (!(o instanceof URIWrapper))
+                               return false;
+                       
+                       return 
((URIWrapper)o).mkShortURI().equals(mkShortURI());
+               }
+               
+               public String mkShortURI() {
+                       String suri;
+                       try {
+                               URI u = new URI(uri);
+                               suri = new URI("http://127.0.0.1";,
+                                               "", "", u.getPort(),
+                                               
u.getPath().replaceAll("^freenet:", ""), u.getQuery(),
+                               "").toString(); 
+                       } catch (URISyntaxException e) {
+                               e.printStackTrace();
+                               return null;
+                       }
+                       if ((suri.indexOf("@") < 0) || (suri.indexOf("@") > 
30)) {
+                               try {
+                                       suri = URLDecoder.decode(suri);
+                               } catch (URLEncodedFormatException e) {
+                                       // TODO Auto-generated catch block
+                                       e.printStackTrace();
+                               }
+                       }
+                       return suri;
+               }
+               
+               public String toEntry() {
+                       URI u = null;
+                       try {
+                               u = new URI(uri);
+                       } catch (URISyntaxException e) {
+                               // TODO Auto-generated catch block
+                               //e.printStackTrace();
+                               return "![ERROR]";
+                       }
+                       
+                       return ("!"+ u.getPath() + 
((descr==null)?"":("\n+"+descr)));
+               }
+       }
+       
+}

Added: trunk/apps/LibrarianSpider/README
===================================================================
--- trunk/apps/LibrarianSpider/README   2006-03-28 19:42:39 UTC (rev 8344)
+++ trunk/apps/LibrarianSpider/README   2006-03-28 20:45:20 UTC (rev 8345)
@@ -0,0 +1,21 @@
+Quick and dirty:
+
+Notes:
+
+All spidering is done via fproxy
+Description is currently the page title
+
+1:
+Add starting points to the file "LibrarianCrawler.java". Akl pages given will
+be tried, and after that, only links going to "localhost" or "127.0.0.1"
+
+2:
+Compile with:
+cyberdo at storm ~/LibrarianSpider $ javac -cp freenet-stable-latest.jar:. 
LibrarianCrawler.java 
+
+3:
+Run with:
+cyberdo at storm ~/LibrarianSpider $ java -cp freenet-stable-latest.jar:. 
LibrarianCrawler
+
+4:
+Upload "out.txt" to the network.. it's the new index


Reply via email to