Author: cutting
Date: Wed Nov 9 09:46:16 2005
New Revision: 332089
URL: http://svn.apache.org/viewcvs?rev=332089&view=rev
Log:
Fix to follow redirects to robots.txt
Modified:
lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
Modified:
lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=332089&r1=332088&r2=332089&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java Wed Nov 9 09:46:16 2005
@@ -53,6 +53,9 @@
private static final boolean ALLOW_FORBIDDEN =
NutchConf.get().getBoolean("http.robots.403.allow", false);
+ private static final int MAX_REDIRECTS =
+ NutchConf.get().getInt("http.redirect.max", 3);
+
private static final String[] AGENTS = getAgents();
private static final Hashtable CACHE = new Hashtable();
@@ -377,16 +380,30 @@
RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
if (robotRules == null) { // cache miss
- HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
-
- if (response.getCode() == 200) // found rules: parse them
- robotRules = new RobotRulesParser().parseRules(response.getContent());
- else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
- robotRules = FORBID_ALL_RULES; // use forbid all
- else
- robotRules = EMPTY_RULES; // use default rules
+ int redirects = 0;
+ do {
+ HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+
+ int code = response.getCode();
+
+ if (code == 200) { // found rules: parse them
+ robotRules = new RobotRulesParser().parseRules(response.getContent());
+ } else if ( (code == 403) && (!ALLOW_FORBIDDEN) ) {
+ robotRules = FORBID_ALL_RULES; // use forbid all
+ } else if (code >= 300 && code < 400) { // handle redirect
+ if (redirects == MAX_REDIRECTS) {
+ robotRules = EMPTY_RULES;
+ } else {
+ url = new URL(url, response.getHeader("Location"));
+ LOG.fine("redirect to " + url);
+ redirects++;
+ }
+ } else {
+ robotRules = EMPTY_RULES; // use default rules
+ }
+ } while (robotRules == null);
- CACHE.put(host, robotRules); // cache rules for host
+ CACHE.put(host, robotRules); // cache rules for host
}
String path = url.getPath(); // check rules