Author: ab
Date: Wed Nov 25 17:20:33 2009
New Revision: 884203

URL: http://svn.apache.org/viewvc?rev=884203&view=rev
Log:
NUTCH-753 Prevent new Fetcher from retrieving the robots twice.

Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884203&r1=884202&r2=884203&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 17:20:33 2009 @@ -2,7 +2,9 @@ Unreleased Changes -* NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab) +* NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab) + +* NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab via ab) * NUTCH-765 - Allow Crawl class to call Either Solr or Lucene Indexer (kubes) Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=884203&r1=884202&r2=884203&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed Nov 25 17:20:33 2009 @@ -185,6 +185,7 @@ String urlString = url.toString(); try { URL u = new URL(urlString); + long delay = serverDelay; if (checkRobots) { try { @@ -197,10 +198,10 @@ logger.trace("Exception checking robot rules for " + url + ": " + e); } } + + long crawlDelay = robots.getCrawlDelay(this, u); + delay = crawlDelay > 0 ? crawlDelay : serverDelay; } - - long crawlDelay = robots.getCrawlDelay(this, u); - long delay = crawlDelay > 0 ? 
crawlDelay : serverDelay; if (checkBlocking && maxCrawlDelay >= 0 && delay > maxCrawlDelay) { // skip this page, otherwise the thread would block for too long. LOGGER.info("Skipping: " + u + " exceeds fetcher.max.crawl.delay, max="