Author: jerome
Date: Mon Jun 5 14:43:42 2006
New Revision: 411926
URL: http://svn.apache.org/viewvc?rev=411926&view=rev
Log:
NUTCH-298: No more NPE when a robots.txt request returns a 404, plus some unit tests
Modified:
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
Modified:
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=411926&r1=411925&r2=411926&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Mon Jun 5 14:43:42 2006
@@ -70,8 +70,8 @@
* file, and can test paths against those rules.
*/
public static class RobotRuleSet {
- ArrayList tmpEntries;
- RobotsEntry[] entries;
+ ArrayList tmpEntries = new ArrayList();
+ RobotsEntry[] entries = null;
long expireTime;
/**
Modified:
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=411926&r1=411925&r2=411926&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
Mon Jun 5 14:43:42 2006
@@ -25,7 +25,29 @@
private static final String CR= "\r";
private static final String CRLF= "\r\n";
-
+ private static final boolean[] ACCEPT_ALL = {
+ true, // "/a",
+ true, // "/a/",
+ true, // "/a/bloh/foo.html"
+ true, // "/b",
+ true, // "/b/a",
+ true, // "/b/a/index.html",
+ true, // "/b/b/foo.html",
+ true, // "/c",
+ true, // "/c/a",
+ true, // "/c/a/index.html",
+ true, // "/c/b/foo.html",
+ true, // "/d",
+ true, // "/d/a",
+ true, // "/e/a/index.html",
+ true, // "/e/d",
+ true, // "/e/d/foo.html",
+ true, // "/e/doh.html",
+ true, // "/f/index.html",
+ true, // "/foo/bar.html",
+ true, // "/f/",
+ };
+
private static final String[] ROBOTS_STRINGS= new String[] {
"User-Agent: Agent1 #foo" + CR
+ "Disallow: /a" + CR
@@ -40,6 +62,7 @@
+ "" + CR
+ "User-Agent: *" + CR
+ "Disallow: /foo/bar/" + CR,
+ null // Used to test EMPTY_RULES
};
private static final String[] AGENT_STRINGS= new String[] {
@@ -57,7 +80,14 @@
false,
false,
true,
- }
+ },
+ {
+ false,
+ false,
+ false,
+ false,
+ true,
+ }
};
private static final String[] TEST_PATHS= new String[] {
@@ -195,6 +225,13 @@
false, // "/foo/bar.html",
true, // "/f/",
}
+ },
+ { // ROBOTS_STRINGS[1]
+ ACCEPT_ALL, // Agent 1
+ ACCEPT_ALL, // Agent 2
+ ACCEPT_ALL, // Agent 3
+ ACCEPT_ALL, // Agent 4
+ ACCEPT_ALL, // Agent 5
}
};
@@ -233,7 +270,9 @@
for (int i= 1; i < agents.length; i++)
agentsString= agentsString + "," + agents[i];
RobotRulesParser p= new RobotRulesParser(agents);
- RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString].getBytes());
+ RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null
+ ? ROBOTS_STRINGS[robotsString].getBytes()
+ : null);
for (int i= 0; i < paths.length; i++) {
assertTrue("testing robots file "+robotsString+", on agents ("
+ agentsString + "), and path " + TEST_PATHS[i] + "; got "
@@ -243,4 +282,6 @@
}
}
+
+
}