This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new b52ec9025 NUTCH-3100 HostDB to support minimum records per host
b52ec9025 is described below

commit b52ec9025e40152b3a1dae7c78bb803c7ad298ce
Author: Markus Jelsma <[email protected]>
AuthorDate: Thu Jan 9 13:50:17 2025 +0200

    NUTCH-3100 HostDB to support minimum records per host
---
 src/java/org/apache/nutch/hostdb/UpdateHostDb.java        | 15 +++++++++++----
 src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java | 10 ++++++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
index 5148a6be1..c8b8c43cf 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
@@ -68,10 +68,11 @@ public class UpdateHostDb extends Configured implements Tool {
   public static final String HOSTDB_STRING_FIELDS = "hostdb.string.fields";
   public static final String HOSTDB_PERCENTILES = "hostdb.percentiles";
   public static final String HOSTDB_CRAWLDATUM_PROCESSORS = "hostdb.crawldatum.processors";
+  public static final String HOSTDB_URL_LIMIT = "hostdb.url.limit";
   
   private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts,
     boolean checkFailed, boolean checkNew, boolean checkKnown,
-    boolean force, boolean filter, boolean normalize) throws Exception {
+    boolean force, boolean filter, boolean normalize, long urlLimit) throws Exception {
 
     StopWatch stopWatch = new StopWatch();
     stopWatch.start();
@@ -126,6 +127,7 @@ public class UpdateHostDb extends Configured implements Tool {
     conf.setBoolean(HOSTDB_FORCE_CHECK, force);
     conf.setBoolean(HOSTDB_URL_FILTERING, filter);
     conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize);
+    conf.setLong(HOSTDB_URL_LIMIT, urlLimit);
     conf.setClassLoader(Thread.currentThread().getContextClassLoader());
     
     try {
@@ -163,7 +165,7 @@ public class UpdateHostDb extends Configured implements Tool {
     if (args.length < 2) {
       System.err.println("Usage: UpdateHostDb -hostdb <hostdb> " +
         "[-tophosts <tophosts>] [-crawldb <crawldb>] [-checkAll] [-checkFailed]" +
-        " [-checkNew] [-checkKnown] [-force] [-filter] [-normalize]");
+        " [-checkNew] [-checkKnown] [-force] [-filter] [-normalize] [-urlLimit <N>]");
       return -1;
     }
 
@@ -175,9 +177,9 @@ public class UpdateHostDb extends Configured implements Tool {
     boolean checkNew = false;
     boolean checkKnown = false;
     boolean force = false;
-
     boolean filter = false;
     boolean normalize = false;
+    long urlLimit = -1l;
 
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-hostdb")) {
@@ -226,6 +228,11 @@ public class UpdateHostDb extends Configured implements Tool {
         LOG.info("UpdateHostDb: normalizing enabled");
         normalize = true;
       }
+      if (args[i].equals("-urlLimit")) {
+        urlLimit = Long.valueOf(args[i + 1]);
+        LOG.info("UpdateHostDb: URL limit set to " + urlLimit);
+        i++;
+      }
     }
 
     if (hostDb == null) {
@@ -235,7 +242,7 @@ public class UpdateHostDb extends Configured implements Tool {
 
     try {
       updateHostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew,
-        checkKnown, force, filter, normalize);
+        checkKnown, force, filter, normalize, urlLimit);
 
       return 0;
     } catch (Exception e) {
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
index 1e41fb6df..2c13756ab 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
@@ -60,6 +60,7 @@ public class UpdateHostDbReducer
   protected static boolean checkKnown = false;
   protected static boolean checkAny = false;
   protected static boolean force = false;
+  protected static long urlLimit = -1l;
   protected static long now = new Date().getTime();
   protected static String[] numericFields;
   protected static String[] stringFields;
@@ -85,6 +86,7 @@ public class UpdateHostDbReducer
     checkKnown = conf.getBoolean(UpdateHostDb.HOSTDB_CHECK_KNOWN, false);
     checkAny = checkNew || checkKnown || checkFailed;
     force = conf.getBoolean(UpdateHostDb.HOSTDB_FORCE_CHECK, false);
+    urlLimit = conf.getLong(UpdateHostDb.HOSTDB_URL_LIMIT,-1l);
     numericFields = conf.getStrings(UpdateHostDb.HOSTDB_NUMERIC_FIELDS);
     stringFields = conf.getStrings(UpdateHostDb.HOSTDB_STRING_FIELDS);
     percentiles = conf.getInts(UpdateHostDb.HOSTDB_PERCENTILES);
@@ -374,6 +376,14 @@ public class UpdateHostDbReducer
       hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
     }
     
+    // Impose limits on minimum number of URLs?
+    if (urlLimit > -1l) {
+      if (hostDatum.numRecords() < urlLimit) {
+        context.getCounter("UpdateHostDb", "url_limit_not_reached").increment(1);
+        return;
+      }
+    }
+    
     context.getCounter("UpdateHostDb", "total_hosts").increment(1);
 
     // See if this record is to be checked

Reply via email to