This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 810b1d6ad NUTCH-3010 Injector: count unique number of injected URLs - add counter urls_injected_unique - improve log messages reporting the counts of injected/merged URLs 810b1d6ad is described below commit 810b1d6ad50fa9021469b4ca5e1db9050a3263c5 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Sat Sep 30 08:09:18 2023 +0200 NUTCH-3010 Injector: count unique number of injected URLs - add counter urls_injected_unique - improve log messages reporting the counts of injected/merged URLs --- src/java/org/apache/nutch/crawl/Injector.java | 31 ++++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index b93e8ca76..9fca719f6 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -341,8 +341,11 @@ public class Injector extends NutchTool implements Tool { ? injected.getFetchInterval() : old.getFetchInterval()); } } - if (injectedSet && oldSet) { - context.getCounter("injector", "urls_merged").increment(1); + if (injectedSet) { + context.getCounter("injector", "urls_injected_unique").increment(1); + if (oldSet) { + context.getCounter("injector", "urls_merged").increment(1); + } } context.write(key, result); } @@ -448,22 +451,24 @@ public class Injector extends NutchTool implements Tool { if (LOG.isInfoEnabled()) { long urlsInjected = job.getCounters() .findCounter("injector", "urls_injected").getValue(); + long urlsInjectedUniq = job.getCounters() + .findCounter("injector", "urls_injected_unique").getValue(); long urlsFiltered = job.getCounters() .findCounter("injector", "urls_filtered").getValue(); long urlsMerged = job.getCounters() .findCounter("injector", "urls_merged").getValue(); - long urlsPurged404= job.getCounters() + long urlsPurged404 = job.getCounters() .findCounter("injector", "urls_purged_404").getValue(); - long urlsPurgedFilter= job.getCounters() + long urlsPurgedFilter = job.getCounters() .findCounter("injector", "urls_purged_filter").getValue(); - LOG.info("Injector: Total urls rejected by filters: " + urlsFiltered); + LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered); LOG.info( - "Injector: Total urls injected after normalization and filtering: " - + urlsInjected); - LOG.info("Injector: Total urls injected but already in CrawlDb: " - + urlsMerged); - LOG.info("Injector: Total new urls injected: " - + (urlsInjected - urlsMerged)); + "Injector: Total urls injected after normalization and filtering: {} (unique URLs: {})", + urlsInjected, urlsInjectedUniq); + LOG.info("Injector: Total urls injected but already in CrawlDb: {}", + urlsMerged); + LOG.info("Injector: Total new urls injected: {}", + (urlsInjectedUniq - urlsMerged)); if (filterNormalizeAll) { LOG.info("Injector: Total urls removed from CrawlDb by filters: {}", urlsPurgedFilter); @@ -475,8 +480,8 @@ public class Injector extends NutchTool implements Tool { } long end = System.currentTimeMillis(); - LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + LOG.info("Injector: finished at {}, elapsed: {}", sdf.format(end), + TimingUtil.elapsedTime(start, end)); } } catch (IOException | InterruptedException | ClassNotFoundException | NullPointerException e) { LOG.error("Injector job failed: {}", e.getMessage());