Author: theli
Date: 2006-03-09 13:19:46 +0100 (Thu, 09 Mar 2006)
New Revision: 1867
Modified:
trunk/source/de/anomic/data/robotsParser.java
Log:
*) better robots.txt support
- previously, the rules for all crawlers and the special rules for YaCy were
combined using AND. Now the general rules are ignored if there is
a special rule block for YaCy (according to the rfc)
Modified: trunk/source/de/anomic/data/robotsParser.java
===================================================================
--- trunk/source/de/anomic/data/robotsParser.java 2006-03-09 11:31:17 UTC
(rev 1866)
+++ trunk/source/de/anomic/data/robotsParser.java 2006-03-09 12:19:46 UTC
(rev 1867)
@@ -95,59 +95,61 @@
}
public static ArrayList parse(BufferedReader reader) throws IOException{
- ArrayList deny = new ArrayList();
+ ArrayList deny4AllAgents = new ArrayList();
+ ArrayList deny4YaCyAgent = new ArrayList();
int pos;
String line = null, lineUpper = null;
- boolean rule4Yacy = false, inBlock = false;
+ boolean isRuleBlock4AllAgents = false,
+ isRuleBlock4YaCyAgent = false,
+ rule4YaCyFound = false,
+ inBlock = false;
while ((line = reader.readLine()) != null) {
line = line.trim();
lineUpper = line.toUpperCase();
if (line.length() == 0) {
- // we have reached the end of the rule block
- rule4Yacy = false; inBlock = false;
+ // OLD: we have reached the end of the rule block
+ // rule4Yacy = false; inBlock = false;
+
+ // NEW: just ignore it
} else if (line.startsWith("#")) {
// we can ignore this. Just a comment line
} else if (lineUpper.startsWith("User-agent:".toUpperCase())) {
if (inBlock) {
+ // we have detected the start of a new block
inBlock = false;
- rule4Yacy = false;
+ isRuleBlock4AllAgents = false;
+ isRuleBlock4YaCyAgent = false;
}
- if (!rule4Yacy) {
- // cutting off comments at the line end
- pos = line.indexOf("#");
- if (pos != -1) {
- line = line.substring(0,pos).trim();
- }
-
- // replacing all tabs with spaces
- line = line.replaceAll("\t"," ");
-
- // getting out the robots name
- pos = line.indexOf(" ");
- if (pos != -1) {
- String userAgent = line.substring(pos).trim();
- rule4Yacy = (userAgent.equals("*") ||
(userAgent.toLowerCase().indexOf("yacy") >=0));
- }
+ // cutting off comments at the line end
+ pos = line.indexOf("#");
+ if (pos != -1) line = line.substring(0,pos).trim();
+
+ // replacing all tabs with spaces
+ line = line.replaceAll("\t"," ");
+
+ // getting out the robots name
+ pos = line.indexOf(" ");
+ if (pos != -1) {
+ String userAgent = line.substring(pos).trim();
+ isRuleBlock4AllAgents |= userAgent.equals("*");
+ isRuleBlock4YaCyAgent |=
userAgent.toLowerCase().indexOf("yacy") >=0;
+ if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
}
} else if (lineUpper.startsWith("Disallow:".toUpperCase())) {
inBlock = true;
- if (rule4Yacy) {
+ if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
// cutting off comments at the line end
pos = line.indexOf("#");
- if (pos != -1) {
- line = line.substring(0,pos).trim();
- }
+ if (pos != -1) line = line.substring(0,pos).trim();
// cutting of tailing *
- if (line.endsWith("*")) {
- line = line.substring(0,line.length()-1);
- }
+ if (line.endsWith("*")) line =
line.substring(0,line.length()-1);
// replacing all tabs with spaces
line = line.replaceAll("\t"," ");
@@ -172,13 +174,14 @@
path = path.replaceAll(";","%3B");
// adding it to the pathlist
- deny.add(path);
+ if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
+ if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
}
}
}
}
- return deny;
+ return (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
}
public static boolean isDisallowed(URL nexturl) {
@@ -293,11 +296,16 @@
// if we previously have downloaded this robots.txt then we can
set the if-modified-since header
httpHeader reqHeaders = new httpHeader();
+
+ // adding referer
+ reqHeaders.put(httpHeader.REFERER, (new
URL(robotsURL,"/")).toString());
+
if (entry != null) {
oldEtag = entry.getETag();
reqHeaders = new httpHeader();
Date modDate = entry.getModDate();
if (modDate != null)
reqHeaders.put(httpHeader.IF_MODIFIED_SINCE,httpc.dateString(entry.getModDate()));
+
}
httpc.response res = con.GET(robotsURL.getFile(), reqHeaders);
_______________________________________________
YaCy-svn mailing list
[email protected]
http://lists.berlios.de/mailman/listinfo/yacy-svn