This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 810b1d6ad NUTCH-3010 Injector: count unique number of injected URLs - 
add counter urls_injected_unique - improve log messages reporting the counts of 
injected/merged URLs
810b1d6ad is described below

commit 810b1d6ad50fa9021469b4ca5e1db9050a3263c5
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Sat Sep 30 08:09:18 2023 +0200

    NUTCH-3010 Injector: count unique number of injected URLs
    - add counter urls_injected_unique
    - improve log messages reporting the counts of injected/merged URLs
---
 src/java/org/apache/nutch/crawl/Injector.java | 31 ++++++++++++++++-----------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index b93e8ca76..9fca719f6 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -341,8 +341,11 @@ public class Injector extends NutchTool implements Tool {
               ? injected.getFetchInterval() : old.getFetchInterval());
         }
       }
-      if (injectedSet && oldSet) {
-        context.getCounter("injector", "urls_merged").increment(1);
+      if (injectedSet) {
+        context.getCounter("injector", "urls_injected_unique").increment(1);
+        if (oldSet) {
+          context.getCounter("injector", "urls_merged").increment(1);
+        }
       }
       context.write(key, result);
     }
@@ -448,22 +451,24 @@ public class Injector extends NutchTool implements Tool {
       if (LOG.isInfoEnabled()) {
         long urlsInjected = job.getCounters()
             .findCounter("injector", "urls_injected").getValue();
+        long urlsInjectedUniq = job.getCounters()
+            .findCounter("injector", "urls_injected_unique").getValue();
         long urlsFiltered = job.getCounters()
             .findCounter("injector", "urls_filtered").getValue();
         long urlsMerged = job.getCounters()
             .findCounter("injector", "urls_merged").getValue();
-        long urlsPurged404= job.getCounters()
+        long urlsPurged404 = job.getCounters()
             .findCounter("injector", "urls_purged_404").getValue();
-        long urlsPurgedFilter= job.getCounters()
+        long urlsPurgedFilter = job.getCounters()
             .findCounter("injector", "urls_purged_filter").getValue();
-        LOG.info("Injector: Total urls rejected by filters: " + urlsFiltered);
+        LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered);
         LOG.info(
-            "Injector: Total urls injected after normalization and filtering: "
-                + urlsInjected);
-        LOG.info("Injector: Total urls injected but already in CrawlDb: "
-            + urlsMerged);
-        LOG.info("Injector: Total new urls injected: "
-            + (urlsInjected - urlsMerged));
+            "Injector: Total urls injected after normalization and filtering: {} (unique URLs: {})",
+            urlsInjected, urlsInjectedUniq);
+        LOG.info("Injector: Total urls injected but already in CrawlDb: {}",
+            urlsMerged);
+        LOG.info("Injector: Total new urls injected: {}",
+            (urlsInjectedUniq - urlsMerged));
         if (filterNormalizeAll) {
           LOG.info("Injector: Total urls removed from CrawlDb by filters: {}",
               urlsPurgedFilter);
@@ -475,8 +480,8 @@ public class Injector extends NutchTool implements Tool {
         }
 
         long end = System.currentTimeMillis();
-        LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
-            + TimingUtil.elapsedTime(start, end));
+        LOG.info("Injector: finished at {}, elapsed: {}", sdf.format(end),
+            TimingUtil.elapsedTime(start, end));
       }
     } catch (IOException | InterruptedException | ClassNotFoundException | NullPointerException e) {
       LOG.error("Injector job failed: {}", e.getMessage());

Reply via email to