Author: ab
Date: Fri Oct 9 13:11:15 2009
New Revision: 823540

URL: http://svn.apache.org/viewvc?rev=823540&view=rev
Log:
NUTCH-731 Redirection of robots.txt in RobotRulesParser.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823540&r1=823539&r2=823540&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Oct 9 13:11:15 2009 @@ -15,6 +15,8 @@ * NUTCH-730 - NPE in LinkRank if no nodes with which to create the WebGraph (Dennis Kubes via ab) +* NUTCH-731 - Redirection of robots.txt in RobotRulesParser (Julien Nioche via ab) + Release 1.0 - 2009-03-23 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=823540&r1=823539&r2=823540&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Fri Oct 9 13:11:15 2009 @@ -434,10 +434,29 @@ boolean cacheRule = true; if (robotRules == null) { // cache miss + URL redir = null; if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); } try { Response response = http.getResponse(new URL(url, "/robots.txt"), new CrawlDatum(), true); + // try one level of redirection ? 
+ if (response.getCode() == 301 || response.getCode() == 302) { + String redirection = response.getHeader("Location"); + if (redirection == null) { + // some versions of MS IIS are known to mangle this header + redirection = response.getHeader("location"); + } + if (redirection != null) { + if (!redirection.startsWith("http")) { + // RFC says it should be absolute, but apparently it isn't + redir = new URL(url, redirection); + } else { + redir = new URL(redirection); + } + + response = http.getResponse(redir, new CrawlDatum(), true); + } + } if (response.getCode() == 200) // found rules: parse them robotRules = parseRules(response.getContent()); @@ -456,8 +475,12 @@ robotRules = EMPTY_RULES; } - if (cacheRule){ + if (cacheRule) { CACHE.put(host, robotRules); // cache rules for host + if (redir != null && !redir.getHost().equals(host)) { + // cache also for the redirected host + CACHE.put(redir.getHost(), robotRules); + } } } return robotRules;