Author: theli
Date: 2006-03-09 15:03:54 +0100 (Thu, 09 Mar 2006)
New Revision: 1872
Modified:
trunk/source/de/anomic/data/robotsParser.java
trunk/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java
Log:
*) Adding support for robots Allow directive
Modified: trunk/source/de/anomic/data/robotsParser.java
===================================================================
--- trunk/source/de/anomic/data/robotsParser.java 2006-03-09 14:03:24 UTC (rev 1871)
+++ trunk/source/de/anomic/data/robotsParser.java 2006-03-09 14:03:54 UTC (rev 1872)
@@ -75,6 +75,11 @@
*/
public final class robotsParser{
+ public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
+ public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
+ public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
+ public static final String ROBOTS_COMMENT = "#";
+
/*public robotsParser(URL robotsUrl){
}*/
/*
@@ -119,9 +124,9 @@
// rule4Yacy = false; inBlock = false;
// NEW: just ignore it
- } else if (line.startsWith("#")) {
+ } else if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
- } else if (lineUpper.startsWith("User-agent:".toUpperCase())) {
+ } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) {
// we have detected the start of a new block
@@ -131,7 +136,7 @@
}
// cutting off comments at the line end
- pos = line.indexOf("#");
+ pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// replacing all tabs with spaces
@@ -145,12 +150,14 @@
isRuleBlock4YaCyAgent |=
userAgent.toLowerCase().indexOf("yacy") >=0;
if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
}
- } else if (lineUpper.startsWith("Disallow:".toUpperCase())) {
+ } else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
+ lineUpper.startsWith(ROBOTS_ALLOW)) {
inBlock = true;
+ boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
// cutting off comments at the line end
- pos = line.indexOf("#");
+ pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// cutting of tailing *
@@ -176,9 +183,10 @@
}
// escaping all occurences of ; because this char is used as special char in the Robots DB
- path = path.replaceAll(";","%3B");
+ path = path.replaceAll(plasmaCrawlRobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");
// adding it to the pathlist
+ if (!isDisallowRule) path = "!" + path;
if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
}
Modified: trunk/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java 2006-03-09 14:03:24 UTC (rev 1871)
+++ trunk/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java 2006-03-09 14:03:54 UTC (rev 1872)
@@ -59,6 +59,9 @@
import de.anomic.kelondro.kelondroException;
public class plasmaCrawlRobotsTxt {
+
+ public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
+
kelondroMap robotsTable;
private final File robotsTableFile;
private int bufferkb;
@@ -168,7 +171,7 @@
this.disallowPathList = new LinkedList();
String csPl = (String) this.mem.get(DISALLOW_PATH_LIST);
if (csPl.length() > 0){
- String[] pathArray = csPl.split(";");
+ String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.disallowPathList.addAll(Arrays.asList(pathArray));
}
@@ -200,7 +203,7 @@
StringBuffer pathListStr = new StringBuffer();
for (int i=0; i<disallowPathList.size();i++) {
pathListStr.append(disallowPathList.get(i))
- .append(";");
+ .append(ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
}
@@ -246,13 +249,26 @@
}
public boolean isDisallowed(String path) {
- if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;
- if ((path == null) || (path.length() == 0)) path = "/";
+ if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;
+ // if the path is null or empty we set it to /
+ if ((path == null) || (path.length() == 0)) path = "/";
+ // escaping all occurences of ; because this char is used as special char in the Robots DB
+ else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B");
+
+
Iterator pathIter = this.disallowPathList.iterator();
while (pathIter.hasNext()) {
String nextPath = (String) pathIter.next();
- if (path.startsWith(nextPath)) return true;
+ // allow rule
+ if (nextPath.startsWith("!") && nextPath.length() > 1 && path.startsWith(nextPath.substring(1))) {
+ return false;
+ }
+
+ // disallow rule
+ if (path.startsWith(nextPath)) {
+ return true;
+ }
}
return false;
}
_______________________________________________
YaCy-svn mailing list
[email protected]
http://lists.berlios.de/mailman/listinfo/yacy-svn