Author: ab
Date: Fri Oct 9 13:11:15 2009
New Revision: 823540
URL: http://svn.apache.org/viewvc?rev=823540&view=rev
Log:
NUTCH-731 Redirection of robots.txt in RobotRulesParser.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823540&r1=823539&r2=823540&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Oct 9 13:11:15 2009
@@ -15,6 +15,8 @@
* NUTCH-730 - NPE in LinkRank if no nodes with which to create the WebGraph
(Dennis Kubes via ab)
+* NUTCH-731 - Redirection of robots.txt in RobotRulesParser (Julien Nioche via ab)
+
Release 1.0 - 2009-03-23
1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
Modified:
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=823540&r1=823539&r2=823540&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Fri Oct 9 13:11:15 2009
@@ -434,10 +434,29 @@
boolean cacheRule = true;
if (robotRules == null) { // cache miss
+ URL redir = null;
if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
try {
Response response = http.getResponse(new URL(url, "/robots.txt"),
new CrawlDatum(), true);
+ // try one level of redirection ?
+ if (response.getCode() == 301 || response.getCode() == 302) {
+ String redirection = response.getHeader("Location");
+ if (redirection == null) {
+ // some versions of MS IIS are known to mangle this header
+ redirection = response.getHeader("location");
+ }
+ if (redirection != null) {
+ if (!redirection.startsWith("http")) {
+ // RFC says it should be absolute, but apparently it isn't
+ redir = new URL(url, redirection);
+ } else {
+ redir = new URL(redirection);
+ }
+
+ response = http.getResponse(redir, new CrawlDatum(), true);
+ }
+ }
if (response.getCode() == 200) // found rules: parse them
robotRules = parseRules(response.getContent());
@@ -456,8 +475,12 @@
robotRules = EMPTY_RULES;
}
- if (cacheRule){
+ if (cacheRule) {
CACHE.put(host, robotRules); // cache rules for host
+ if (redir != null && !redir.getHost().equals(host)) {
+ // cache also for the redirected host
+ CACHE.put(redir.getHost(), robotRules);
+ }
}
}
return robotRules;