Author: ab
Date: Fri Oct  9 13:11:15 2009
New Revision: 823540

URL: http://svn.apache.org/viewvc?rev=823540&view=rev
Log:
NUTCH-731 Redirection of robots.txt in RobotRulesParser.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823540&r1=823539&r2=823540&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Oct  9 13:11:15 2009
@@ -15,6 +15,8 @@
 * NUTCH-730 - NPE in LinkRank if no nodes with which to create the WebGraph
   (Dennis Kubes via ab)
 
+* NUTCH-731 - Redirection of robots.txt in RobotRulesParser (Julien Nioche via ab)
+
 Release 1.0 - 2009-03-23
 
  1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=823540&r1=823539&r2=823540&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Fri Oct  9 13:11:15 2009
@@ -434,10 +434,29 @@
     boolean cacheRule = true;
     
     if (robotRules == null) {                     // cache miss
+      URL redir = null;
       if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
       try {
         Response response = http.getResponse(new URL(url, "/robots.txt"),
                                              new CrawlDatum(), true);
+        // try one level of redirection ?
+        if (response.getCode() == 301 || response.getCode() == 302) {
+          String redirection = response.getHeader("Location");
+          if (redirection == null) {
+            // some versions of MS IIS are known to mangle this header
+            redirection = response.getHeader("location");
+          }
+          if (redirection != null) {
+            if (!redirection.startsWith("http")) {
+              // RFC says it should be absolute, but apparently it isn't
+              redir = new URL(url, redirection);
+            } else {
+              redir = new URL(redirection);
+            }
+            
+            response = http.getResponse(redir, new CrawlDatum(), true);
+          }
+        }
 
         if (response.getCode() == 200)               // found rules: parse them
           robotRules = parseRules(response.getContent());
@@ -456,8 +475,12 @@
         robotRules = EMPTY_RULES;
       }
 
-      if (cacheRule){
+      if (cacheRule) {
         CACHE.put(host, robotRules);  // cache rules for host
+        if (redir != null && !redir.getHost().equals(host)) {
+          // cache also for the redirected host
+          CACHE.put(redir.getHost(), robotRules);
+        }
       }
     }
     return robotRules;


Reply via email to