Author: nextgens
Date: 2006-05-17 15:12:08 +0000 (Wed, 17 May 2006)
New Revision: 8741

Modified:
   trunk/apps/LibrarianSpider/LibrarianCrawler.java
Log:
plugins:

fix LibrarianCrawler

Modified: trunk/apps/LibrarianSpider/LibrarianCrawler.java
===================================================================
--- trunk/apps/LibrarianSpider/LibrarianCrawler.java    2006-05-17 14:48:49 UTC 
(rev 8740)
+++ trunk/apps/LibrarianSpider/LibrarianCrawler.java    2006-05-17 15:12:08 UTC 
(rev 8741)
@@ -39,14 +39,7 @@
                String urlsstring;
                String muststartwith = "http://127.0.0.1:8888/";
                String muststartwith2 = "http://localhost:8888/";
-               //uriq.add(new URIWrapper("http://localhost:8888/SSK at 
LvX5TuKWlL3dbbpwn-2NvQDa5s9YHia~EeHItgBCugA,24Av4ZiTyEYvGKFgH0I~mYEdRoNb9tHQbZO2-89H0~c,AQABAAE/gallery-001/"));
-               uriInList.add(new 
URIWrapper("http://wiki.freenetproject.org/FreenetTestPages"));
-               uriInList.add(new URIWrapper("http://127.0.0.1:8888/SSK at 
60I8H8HinpgZSOuTSD66AVlIFAy-xsppFr0YCzCar7c,NzdivUGCGOdlgngOGRbbKDNfSCnjI0FXjHLzJM4xkJ4,AQABAAE/index-3/"));
-               uriInList.add(new URIWrapper("http://localhost:8888/SSK at 
60I8H8HinpgZSOuTSD66AVlIFAy-xsppFr0YCzCar7c,NzdivUGCGOdlgngOGRbbKDNfSCnjI0FXjHLzJM4xkJ4,AQABAAE/index-4/"));
-               /*uriq.add(new URIWrapper(""));
-               uriq.add(new URIWrapper(""));
-               uriq.add(new URIWrapper(""));
-               */
+               uriInList.add(new URIWrapper("http://localhost:8888/USK@BPZppy07RyID~NGihHgs4AAw3fUXxgtKIrwRu5rtpWE,k5yjkAFJC93JkydKl6vpY0Zy9D8ec1ymv2XP4Tx5Io0,AQABAAE/FreeHoo/5/"));
                BufferedWriter bw = new BufferedWriter(new FileWriter(new 
File("out.txt")));
                while (uriInList.peek() != null) {
                        try {
@@ -102,6 +95,8 @@
                        String href = "";
                        while ((hrefpos = indata.indexOf("href=\"", hrefpos + 
1)) > 0) {
                                href = indata.substring(hrefpos + 6, 
indata.indexOf("\"", hrefpos+7));
+                               if(href.charAt(0)=='\"') href="invalid";
+                               try{
                                urlsstring += u.resolve(href).getPath();
                                //if (!href.endsWith("htm") && 
!href.endsWith("html") && !href.endsWith("/") && !href.endsWith("txt")) {
                                String h = href.toLowerCase();
@@ -110,20 +105,20 @@
                                } else  if 
((u.resolve(href).toString().startsWith(muststartwith)) || 
(u.resolve(href).toString().startsWith(muststartwith2))){
                                        URIWrapper uw = new 
URIWrapper(u.resolve(href).toString());
                                        if (!uw.equals(currenturi))
-                                               if (!uriInList.contains(uw))
                                                        if 
(!uriOutList.contains(uw)) {
                                                                
uriInList.add(new URIWrapper(u.resolve(href).toString()));
                                                                
//System.err.print(".");
                                                        }       
-                               } else
-                                       //System.err.println("External link: "+ 
u.resolve(href));
-                                       ;
+                               } 
+                               }catch (Exception e){
+                               }
                        }
+                       System.out.println(indata);
                        //currenturi.descr = 
indata.replaceAll(".*<[tT][iI][tT][lL][eE]>.*</[tT][iI][tT][lL][eE]>", "\\1");
-                       currenturi.descr = 
indata.replaceAll(".*<[tT][iI][tT][lL][eE]>", 
"").replaceAll("</[tT][iI][tT][lL][eE]>.*", "");
+                       //currenturi.descr = 
indata.replaceAll(".*<[tT][iI][tT][lL][eE]>", 
"").replaceAll("</[tT][iI][tT][lL][eE]>.*", "");
                        if 
((u.resolve(href).toString().startsWith(muststartwith)) || 
(u.resolve(href).toString().startsWith(muststartwith2))){
                                indata = indata.toLowerCase();
-                               indata = indata.replaceAll(".*<body[^>]*>", "");
+               //              indata = indata.replaceAll(".*<body[^>]*>", "");
                                indata = indata.replaceAll("</body[^>]*", "");
                                indata = indata.replaceAll("<href=\".[^\"]*\"", 
"><");
                                indata = indata.replaceAll("<[^>]*>", "");


Reply via email to