This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 810b1d6ad NUTCH-3010 Injector: count unique number of injected URLs -
add counter urls_injected_unique - improve log messages reporting the counts of
injected/merged URLs
810b1d6ad is described below
commit 810b1d6ad50fa9021469b4ca5e1db9050a3263c5
Author: Sebastian Nagel <[email protected]>
AuthorDate: Sat Sep 30 08:09:18 2023 +0200
NUTCH-3010 Injector: count unique number of injected URLs
- add counter urls_injected_unique
- improve log messages reporting the counts of injected/merged URLs
---
src/java/org/apache/nutch/crawl/Injector.java | 31 ++++++++++++++++-----------
1 file changed, 18 insertions(+), 13 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index b93e8ca76..9fca719f6 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -341,8 +341,11 @@ public class Injector extends NutchTool implements Tool {
? injected.getFetchInterval() : old.getFetchInterval());
}
}
- if (injectedSet && oldSet) {
- context.getCounter("injector", "urls_merged").increment(1);
+ if (injectedSet) {
+ context.getCounter("injector", "urls_injected_unique").increment(1);
+ if (oldSet) {
+ context.getCounter("injector", "urls_merged").increment(1);
+ }
}
context.write(key, result);
}
@@ -448,22 +451,24 @@ public class Injector extends NutchTool implements Tool {
if (LOG.isInfoEnabled()) {
long urlsInjected = job.getCounters()
.findCounter("injector", "urls_injected").getValue();
+ long urlsInjectedUniq = job.getCounters()
+ .findCounter("injector", "urls_injected_unique").getValue();
long urlsFiltered = job.getCounters()
.findCounter("injector", "urls_filtered").getValue();
long urlsMerged = job.getCounters()
.findCounter("injector", "urls_merged").getValue();
- long urlsPurged404= job.getCounters()
+ long urlsPurged404 = job.getCounters()
.findCounter("injector", "urls_purged_404").getValue();
- long urlsPurgedFilter= job.getCounters()
+ long urlsPurgedFilter = job.getCounters()
.findCounter("injector", "urls_purged_filter").getValue();
- LOG.info("Injector: Total urls rejected by filters: " + urlsFiltered);
+ LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered);
LOG.info(
- "Injector: Total urls injected after normalization and filtering: "
- + urlsInjected);
- LOG.info("Injector: Total urls injected but already in CrawlDb: "
- + urlsMerged);
- LOG.info("Injector: Total new urls injected: "
- + (urlsInjected - urlsMerged));
+ "Injector: Total urls injected after normalization and filtering: {} (unique URLs: {})",
+ urlsInjected, urlsInjectedUniq);
+ LOG.info("Injector: Total urls injected but already in CrawlDb: {}",
+ urlsMerged);
+ LOG.info("Injector: Total new urls injected: {}",
+ (urlsInjectedUniq - urlsMerged));
if (filterNormalizeAll) {
LOG.info("Injector: Total urls removed from CrawlDb by filters: {}",
urlsPurgedFilter);
@@ -475,8 +480,8 @@ public class Injector extends NutchTool implements Tool {
}
long end = System.currentTimeMillis();
- LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
- + TimingUtil.elapsedTime(start, end));
+ LOG.info("Injector: finished at {}, elapsed: {}", sdf.format(end),
+ TimingUtil.elapsedTime(start, end));
}
} catch (IOException | InterruptedException | ClassNotFoundException |
NullPointerException e) {
LOG.error("Injector job failed: {}", e.getMessage());