This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 524a59480a3e258a0363faf343fa57875f8f9ea8
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Mon Oct 8 14:50:51 2018 +0200

    NUTCH-2630 Fetcher to log skipped records by robots.txt
    - change log level from DEBUG to INFO (the default) for messages
      reporting URLs skipped because of robots.txt rules
      (disallow or a crawl delay larger than fetcher.max.crawl.delay)
---
 src/java/org/apache/nutch/fetcher/FetcherThread.java | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index bfcc374..6ba920e 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -302,9 +302,7 @@ public class FetcherThread extends Thread {
             if (!rules.isAllowed(fit.url.toString())) {
               // unblock
               ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
-              if (LOG.isDebugEnabled()) {
-                LOG.debug("Denied by robots.txt: {}", fit.url);
-              }
+              LOG.info("Denied by robots.txt: {}", fit.url);
               output(fit.url, fit.datum, null,
                   ProtocolStatus.STATUS_ROBOTS_DENIED,
                   CrawlDatum.STATUS_FETCH_GONE);
@@ -315,7 +313,7 @@ public class FetcherThread extends Thread {
              if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                 // unblock
                 ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
-                LOG.debug("Crawl-Delay for {} too long ({}), skipping", fit.url,
+                LOG.info("Crawl-Delay for {} too long ({}), skipping", fit.url,
                     rules.getCrawlDelay());
                 output(fit.url, fit.datum, null,
                     ProtocolStatus.STATUS_ROBOTS_DENIED,
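
For readers unfamiliar with the robots handling above, here is a minimal,
self-contained sketch (not Nutch code) of the crawler-commons API that
FetcherThread consults: BaseRobotRules.isAllowed() and getCrawlDelay().
The robots.txt body, the agent name "mybot", and the 30-second threshold
are illustrative assumptions, not values taken from the commit; Nutch
derives the real threshold from the fetcher.max.crawl.delay property.

// Sketch of the crawler-commons robots rules checks mirrored by the two
// hunks above. Assumptions (not from the commit): robots.txt body, agent
// name "mybot", and a 30-second maximum crawl delay.
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotRulesSketch {
  public static void main(String[] args) {
    byte[] robotsTxt = String.join("\n",
        "User-agent: *",
        "Disallow: /private/",
        "Crawl-delay: 60").getBytes(StandardCharsets.UTF_8);

    BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
        "https://example.com/robots.txt", robotsTxt, "text/plain", "mybot");

    long maxCrawlDelay = 30 * 1000L; // milliseconds, cf. fetcher.max.crawl.delay

    String url = "https://example.com/private/page.html";
    if (!rules.isAllowed(url)) {
      // corresponds to the first hunk: denied by a Disallow rule
      System.out.println("Denied by robots.txt: " + url);
    } else if (maxCrawlDelay >= 0 && rules.getCrawlDelay() > maxCrawlDelay) {
      // corresponds to the second hunk: Crawl-Delay exceeds the maximum
      System.out.println("Crawl-Delay for " + url + " too long ("
          + rules.getCrawlDelay() + "), skipping");
    }
  }
}

With these inputs the Disallow rule fires and prints the first message; for
a URL outside /private/ the 60-second Crawl-Delay would exceed the assumed
30-second threshold and trigger the second message instead.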
