Author: tejasp
Date: Mon Jan 28 07:59:41 2013
New Revision: 1439291

URL: http://svn.apache.org/viewvc?rev=1439291&view=rev
Log:
NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1439291&r1=1439290&r2=1439291&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Jan 28 07:59:41 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (Tejas Patil)
+
 * NUTCH-1453 Substantiate tests for IndexingFilters (lufeng via lewismc)
 
 * NUTCH-1274 Fix [cast] javac warnings (Tejas Patil via lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1439291&r1=1439290&r2=1439291&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Mon Jan 28 07:59:41 2013
@@ -484,7 +484,8 @@ extends GoraReducer<IntWritable, FetchEn
             reprUrl = TableUtil.toString(fit.page.getReprUrl());
           }
           try {
-            LOG.info("fetching " + fit.url);
+            LOG.info("fetching " + fit.url + " (queue crawl delay=" + 
+                      fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay + 
"ms)"); 
 
             // fetch the page
             final Protocol protocol = 
this.protocolFactory.getProtocol(fit.url);
@@ -500,7 +501,7 @@ extends GoraReducer<IntWritable, FetchEn
               continue;
             }
             if (rules.getCrawlDelay() > 0) {
-              if (rules.getCrawlDelay() > maxCrawlDelay) {
+              if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) 
{
                 // unblock
                 fetchQueues.finishFetchItem(fit, true);
                 LOG.debug("Crawl-Delay for " + fit.url + " too long (" + 
rules.getCrawlDelay() + "), skipping");
@@ -509,6 +510,9 @@ extends GoraReducer<IntWritable, FetchEn
               } else {
                 final FetchItemQueue fiq = 
fetchQueues.getFetchItemQueue(fit.queueID);
                 fiq.crawlDelay = rules.getCrawlDelay();
+                if (LOG.isDebugEnabled()) {
+                  LOG.info("Crawl delay for queue: " + fit.queueID + " is set 
to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
+                }
               }
             }
             final ProtocolOutput output = protocol.getProtocolOutput(fit.url, 
fit.page);
@@ -875,3 +879,4 @@ extends GoraReducer<IntWritable, FetchEn
     LOG.info("-activeThreads=" + activeThreads);
   }
 }
+


Reply via email to