Author: siren Date: Tue Sep 19 07:52:37 2006 New Revision: 447867 URL: http://svn.apache.org/viewvc?view=rev&rev=447867 Log: NUTCH-105 - Network error during robots.txt fetch causes file to be ignored, contributed by Greg Kim
Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?view=diff&rev=447867&r1=447866&r2=447867 ============================================================================== --- lucene/nutch/branches/branch-0.8/CHANGES.txt (original) +++ lucene/nutch/branches/branch-0.8/CHANGES.txt Tue Sep 19 07:52:37 2006 @@ -22,6 +22,9 @@ 7. NUTCH-338 - Remove the text parser as an option for parsing PDF files in parse-plugins.xml (Chris A. Mattmann via siren) + 8. NUTCH-105 - Network error during robots.txt fetch causes file to + beignored (Greg Kim via siren) + Release 0.8 - 2006-07-25 0. Totally new architecture, based on hadoop Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=447867&r1=447866&r2=447867 ============================================================================== --- lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original) +++ lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Tue Sep 19 07:52:37 2006 @@ -420,6 +420,8 @@ RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host); + boolean cacheRule = true; + if (robotRules == null) { // cache miss if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); } try { @@ -430,16 +432,22 @@ robotRules = parseRules(response.getContent()); else if ( (response.getCode() == 403) && (!allowForbidden) ) robotRules = FORBID_ALL_RULES; // use forbid all - else + else if (response.getCode() >= 
500) { + cacheRule = false; + robotRules = EMPTY_RULES; + }else robotRules = EMPTY_RULES; // use default rules } catch (Throwable t) { if (LOG.isInfoEnabled()) { LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); } + cacheRule = false; robotRules = EMPTY_RULES; } - CACHE.put(host, robotRules); // cache rules for host + if (cacheRule){ + CACHE.put(host, robotRules); // cache rules for host + } } return robotRules; } ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys -- and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs