Author: nextgens
Date: 2006-05-17 15:12:08 +0000 (Wed, 17 May 2006)
New Revision: 8741
Modified:
trunk/apps/LibrarianSpider/LibrarianCrawler.java
Log:
plugins:
fix LibrarianCrawler
Modified: trunk/apps/LibrarianSpider/LibrarianCrawler.java
===================================================================
--- trunk/apps/LibrarianSpider/LibrarianCrawler.java 2006-05-17 14:48:49 UTC
(rev 8740)
+++ trunk/apps/LibrarianSpider/LibrarianCrawler.java 2006-05-17 15:12:08 UTC
(rev 8741)
@@ -39,14 +39,7 @@
String urlsstring;
String muststartwith = "http://127.0.0.1:8888/";
String muststartwith2 = "http://localhost:8888/";
- //uriq.add(new URIWrapper("http://localhost:8888/SSK@LvX5TuKWlL3dbbpwn-2NvQDa5s9YHia~EeHItgBCugA,24Av4ZiTyEYvGKFgH0I~mYEdRoNb9tHQbZO2-89H0~c,AQABAAE/gallery-001/"));
- uriInList.add(new
URIWrapper("http://wiki.freenetproject.org/FreenetTestPages"));
- uriInList.add(new URIWrapper("http://127.0.0.1:8888/SSK@60I8H8HinpgZSOuTSD66AVlIFAy-xsppFr0YCzCar7c,NzdivUGCGOdlgngOGRbbKDNfSCnjI0FXjHLzJM4xkJ4,AQABAAE/index-3/"));
- uriInList.add(new URIWrapper("http://localhost:8888/SSK@60I8H8HinpgZSOuTSD66AVlIFAy-xsppFr0YCzCar7c,NzdivUGCGOdlgngOGRbbKDNfSCnjI0FXjHLzJM4xkJ4,AQABAAE/index-4/"));
- /*uriq.add(new URIWrapper(""));
- uriq.add(new URIWrapper(""));
- uriq.add(new URIWrapper(""));
- */
+ uriInList.add(new URIWrapper("http://localhost:8888/USK@BPZppy07RyID~NGihHgs4AAw3fUXxgtKIrwRu5rtpWE,k5yjkAFJC93JkydKl6vpY0Zy9D8ec1ymv2XP4Tx5Io0,AQABAAE/FreeHoo/5/"));
BufferedWriter bw = new BufferedWriter(new FileWriter(new
File("out.txt")));
while (uriInList.peek() != null) {
try {
@@ -102,6 +95,8 @@
String href = "";
while ((hrefpos = indata.indexOf("href=\"", hrefpos +
1)) > 0) {
href = indata.substring(hrefpos + 6,
indata.indexOf("\"", hrefpos+7));
+ if(href.charAt(0)=='\"') href="invalid";
+ try{
urlsstring += u.resolve(href).getPath();
//if (!href.endsWith("htm") &&
!href.endsWith("html") && !href.endsWith("/") && !href.endsWith("txt")) {
String h = href.toLowerCase();
@@ -110,20 +105,20 @@
} else if
((u.resolve(href).toString().startsWith(muststartwith)) ||
(u.resolve(href).toString().startsWith(muststartwith2))){
URIWrapper uw = new
URIWrapper(u.resolve(href).toString());
if (!uw.equals(currenturi))
- if (!uriInList.contains(uw))
if
(!uriOutList.contains(uw)) {
uriInList.add(new URIWrapper(u.resolve(href).toString()));
//System.err.print(".");
}
- } else
- //System.err.println("External link: "+
u.resolve(href));
- ;
+ }
+ }catch (Exception e){
+ }
}
+ System.out.println(indata);
//currenturi.descr =
indata.replaceAll(".*<[tT][iI][tT][lL][eE]>.*</[tT][iI][tT][lL][eE]>", "\\1");
- currenturi.descr =
indata.replaceAll(".*<[tT][iI][tT][lL][eE]>",
"").replaceAll("</[tT][iI][tT][lL][eE]>.*", "");
+ //currenturi.descr =
indata.replaceAll(".*<[tT][iI][tT][lL][eE]>",
"").replaceAll("</[tT][iI][tT][lL][eE]>.*", "");
if
((u.resolve(href).toString().startsWith(muststartwith)) ||
(u.resolve(href).toString().startsWith(muststartwith2))){
indata = indata.toLowerCase();
- indata = indata.replaceAll(".*<body[^>]*>", "");
+ // indata = indata.replaceAll(".*<body[^>]*>", "");
indata = indata.replaceAll("</body[^>]*", "");
indata = indata.replaceAll("<href=\".[^\"]*\"",
"><");
indata = indata.replaceAll("<[^>]*>", "");