This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit e0a27c7870d632966d584cf45399b98ba77e2bd6 Author: Sebastian Nagel <[email protected]> AuthorDate: Sun Dec 17 16:13:09 2017 +0100 NUTCH-2034 CrawlDB update job to count documents in CrawlDb rejected by URL filters (patch contributed by Luis Lopez) --- src/java/org/apache/nutch/crawl/CrawlDb.java | 12 +++++++++++- src/java/org/apache/nutch/crawl/CrawlDbFilter.java | 5 ++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java index 080b037..9f37447 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDb.java +++ b/src/java/org/apache/nutch/crawl/CrawlDb.java @@ -115,8 +115,9 @@ public class CrawlDb extends NutchTool implements Tool { if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: Merging segment data into db."); } + RunningJob crawlDBJob = null; try { - JobClient.runJob(job); + crawlDBJob = JobClient.runJob(job); } catch (IOException e) { FileSystem fs = crawlDb.getFileSystem(getConf()); LockUtil.removeLockFile(fs, lock); @@ -127,6 +128,15 @@ } CrawlDb.install(job, crawlDb); + + if (filter) { + long urlsFiltered = crawlDBJob.getCounters() + .findCounter("CrawlDB filter", "URLs filtered").getValue(); + LOG.info( + "CrawlDb update: Total number of existing URLs in CrawlDb rejected by URL filters: {}", + urlsFiltered); + } + long end = System.currentTimeMillis(); LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java index 7b2aa80..8b46ecb 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java @@ -111,7 +111,10 @@ public class CrawlDbFilter implements url = null; } } - if (url != null) { // if it passes + if (url == null) { + reporter.getCounter("CrawlDB filter", "URLs filtered").increment(1); + } else { + // URL has passed filters newKey.set(url); // collect it output.collect(newKey, value); } -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
