This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit e0a27c7870d632966d584cf45399b98ba77e2bd6 Author: Sebastian Nagel <[email protected]> AuthorDate: Sun Dec 17 16:13:09 2017 +0100 NUTCH-2034 CrawlDB update job to count documents in CrawlDb rejected by URL filters (patch contributed by Luis Lopez) --- src/java/org/apache/nutch/crawl/CrawlDb.java | 12 +++++++++++- src/java/org/apache/nutch/crawl/CrawlDbFilter.java | 5 ++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java index 080b037..9f37447 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDb.java +++ b/src/java/org/apache/nutch/crawl/CrawlDb.java @@ -115,8 +115,9 @@ public class CrawlDb extends NutchTool implements Tool { if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: Merging segment data into db."); } + RunningJob crawlDBJob = null; try { - JobClient.runJob(job); + crawlDBJob = JobClient.runJob(job); } catch (IOException e) { FileSystem fs = crawlDb.getFileSystem(getConf()); LockUtil.removeLockFile(fs, lock); @@ -127,6 +128,15 @@ } CrawlDb.install(job, crawlDb); + + if (filter) { + long urlsFiltered = crawlDBJob.getCounters() + .findCounter("CrawlDB filter", "URLs filtered").getValue(); + LOG.info( + "CrawlDb update: Total number of existing URLs in CrawlDb rejected by URL filters: {}", + urlsFiltered); + } + long end = System.currentTimeMillis(); LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java index 7b2aa80..8b46ecb 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java @@ -111,7 +111,10 @@ public class CrawlDbFilter implements url = null; } } - if (url != null) { // if it passes + if (url == null) { + reporter.getCounter("CrawlDB filter", "URLs filtered").increment(1); + } else { + // URL has passed filters newKey.set(url); // collect it output.collect(newKey, value); } -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
