Author: nextgens
Date: 2006-05-17 15:18:45 +0000 (Wed, 17 May 2006)
New Revision: 8743
Modified:
trunk/apps/LibrarianSpider/LibrarianCrawler.java
Log:
reverting r8741~, my bad
Modified: trunk/apps/LibrarianSpider/LibrarianCrawler.java
===================================================================
--- trunk/apps/LibrarianSpider/LibrarianCrawler.java 2006-05-17 15:12:26 UTC (rev 8742)
+++ trunk/apps/LibrarianSpider/LibrarianCrawler.java 2006-05-17 15:18:45 UTC (rev 8743)
@@ -39,7 +39,14 @@
String urlsstring;
String muststartwith = "http://127.0.0.1:8888/";
String muststartwith2 = "http://localhost:8888/";
- uriInList.add(new URIWrapper("http://localhost:8888/USK@BPZppy07RyID~NGihHgs4AAw3fUXxgtKIrwRu5rtpWE,k5yjkAFJC93JkydKl6vpY0Zy9D8ec1ymv2XP4Tx5Io0,AQABAAE/FreeHoo/5/"));
+ //uriq.add(new URIWrapper("http://localhost:8888/SSK@LvX5TuKWlL3dbbpwn-2NvQDa5s9YHia~EeHItgBCugA,24Av4ZiTyEYvGKFgH0I~mYEdRoNb9tHQbZO2-89H0~c,AQABAAE/gallery-001/"));
+ uriInList.add(new URIWrapper("http://wiki.freenetproject.org/FreenetTestPages"));
+ uriInList.add(new URIWrapper("http://127.0.0.1:8888/SSK@60I8H8HinpgZSOuTSD66AVlIFAy-xsppFr0YCzCar7c,NzdivUGCGOdlgngOGRbbKDNfSCnjI0FXjHLzJM4xkJ4,AQABAAE/index-3/"));
+ uriInList.add(new URIWrapper("http://localhost:8888/SSK@60I8H8HinpgZSOuTSD66AVlIFAy-xsppFr0YCzCar7c,NzdivUGCGOdlgngOGRbbKDNfSCnjI0FXjHLzJM4xkJ4,AQABAAE/index-4/"));
+ /*uriq.add(new URIWrapper(""));
+ uriq.add(new URIWrapper(""));
+ uriq.add(new URIWrapper(""));
+ */
BufferedWriter bw = new BufferedWriter(new FileWriter(new File("out.txt")));
while (uriInList.peek() != null) {
try {
@@ -95,8 +102,6 @@
String href = "";
while ((hrefpos = indata.indexOf("href=\"", hrefpos + 1)) > 0) {
href = indata.substring(hrefpos + 6, indata.indexOf("\"", hrefpos+7));
- if(href.charAt(0)=='\"') href="invalid";
- try{
urlsstring += u.resolve(href).getPath();
//if (!href.endsWith("htm") && !href.endsWith("html") && !href.endsWith("/") && !href.endsWith("txt")) {
String h = href.toLowerCase();
@@ -105,20 +110,20 @@
} else if ((u.resolve(href).toString().startsWith(muststartwith)) || (u.resolve(href).toString().startsWith(muststartwith2))){
URIWrapper uw = new URIWrapper(u.resolve(href).toString());
if (!uw.equals(currenturi))
+ if (!uriInList.contains(uw))
if (!uriOutList.contains(uw)) {
uriInList.add(new URIWrapper(u.resolve(href).toString()));
//System.err.print(".");
}
- }
- }catch (Exception e){
- }
+ } else
+ //System.err.println("External link: "+ u.resolve(href));
+ ;
}
- System.out.println(indata);
//currenturi.descr = indata.replaceAll(".*<[tT][iI][tT][lL][eE]>.*</[tT][iI][tT][lL][eE]>", "\\1");
- //currenturi.descr = indata.replaceAll(".*<[tT][iI][tT][lL][eE]>", "").replaceAll("</[tT][iI][tT][lL][eE]>.*", "");
+ currenturi.descr = indata.replaceAll(".*<[tT][iI][tT][lL][eE]>", "").replaceAll("</[tT][iI][tT][lL][eE]>.*", "");
if ((u.resolve(href).toString().startsWith(muststartwith)) || (u.resolve(href).toString().startsWith(muststartwith2))){
indata = indata.toLowerCase();
- // indata = indata.replaceAll(".*<body[^>]*>", "");
+ indata = indata.replaceAll(".*<body[^>]*>", "");
indata = indata.replaceAll("</body[^>]*", "");
indata = indata.replaceAll("<href=\".[^\"]*\"", "><");
indata = indata.replaceAll("<[^>]*>", "");