This is an automated email from the ASF dual-hosted git repository.
markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new b52ec9025 NUTCH-3100 HostDB to support minimum records per host
b52ec9025 is described below
commit b52ec9025e40152b3a1dae7c78bb803c7ad298ce
Author: Markus Jelsma <[email protected]>
AuthorDate: Thu Jan 9 13:50:17 2025 +0200
NUTCH-3100 HostDB to support minimum records per host
---
src/java/org/apache/nutch/hostdb/UpdateHostDb.java | 15 +++++++++++----
src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java | 10 ++++++++++
2 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
index 5148a6be1..c8b8c43cf 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
@@ -68,10 +68,11 @@ public class UpdateHostDb extends Configured implements Tool {
public static final String HOSTDB_STRING_FIELDS = "hostdb.string.fields";
public static final String HOSTDB_PERCENTILES = "hostdb.percentiles";
public static final String HOSTDB_CRAWLDATUM_PROCESSORS =
"hostdb.crawldatum.processors";
+ public static final String HOSTDB_URL_LIMIT = "hostdb.url.limit";
private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts,
boolean checkFailed, boolean checkNew, boolean checkKnown,
- boolean force, boolean filter, boolean normalize) throws Exception {
+ boolean force, boolean filter, boolean normalize, long urlLimit) throws Exception {
StopWatch stopWatch = new StopWatch();
stopWatch.start();
@@ -126,6 +127,7 @@ public class UpdateHostDb extends Configured implements Tool {
conf.setBoolean(HOSTDB_FORCE_CHECK, force);
conf.setBoolean(HOSTDB_URL_FILTERING, filter);
conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize);
+ conf.setLong(HOSTDB_URL_LIMIT, urlLimit);
conf.setClassLoader(Thread.currentThread().getContextClassLoader());
try {
@@ -163,7 +165,7 @@ public class UpdateHostDb extends Configured implements Tool {
if (args.length < 2) {
System.err.println("Usage: UpdateHostDb -hostdb <hostdb> " +
"[-tophosts <tophosts>] [-crawldb <crawldb>] [-checkAll] [-checkFailed]" +
- " [-checkNew] [-checkKnown] [-force] [-filter] [-normalize]");
+ " [-checkNew] [-checkKnown] [-force] [-filter] [-normalize] [-urlLimit <N>]");
return -1;
}
@@ -175,9 +177,9 @@ public class UpdateHostDb extends Configured implements Tool {
boolean checkNew = false;
boolean checkKnown = false;
boolean force = false;
-
boolean filter = false;
boolean normalize = false;
+ long urlLimit = -1l;
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-hostdb")) {
@@ -226,6 +228,11 @@ public class UpdateHostDb extends Configured implements Tool {
LOG.info("UpdateHostDb: normalizing enabled");
normalize = true;
}
+ if (args[i].equals("-urlLimit")) {
+ urlLimit = Long.valueOf(args[i + 1]);
+ LOG.info("UpdateHostDb: URL limit set to " + urlLimit);
+ i++;
+ }
}
if (hostDb == null) {
@@ -235,7 +242,7 @@ public class UpdateHostDb extends Configured implements Tool {
try {
updateHostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew,
- checkKnown, force, filter, normalize);
+ checkKnown, force, filter, normalize, urlLimit);
return 0;
} catch (Exception e) {
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
index 1e41fb6df..2c13756ab 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
@@ -60,6 +60,7 @@ public class UpdateHostDbReducer
protected static boolean checkKnown = false;
protected static boolean checkAny = false;
protected static boolean force = false;
+ protected static long urlLimit = -1l;
protected static long now = new Date().getTime();
protected static String[] numericFields;
protected static String[] stringFields;
@@ -85,6 +86,7 @@ public class UpdateHostDbReducer
checkKnown = conf.getBoolean(UpdateHostDb.HOSTDB_CHECK_KNOWN, false);
checkAny = checkNew || checkKnown || checkFailed;
force = conf.getBoolean(UpdateHostDb.HOSTDB_FORCE_CHECK, false);
+ urlLimit = conf.getLong(UpdateHostDb.HOSTDB_URL_LIMIT,-1l);
numericFields = conf.getStrings(UpdateHostDb.HOSTDB_NUMERIC_FIELDS);
stringFields = conf.getStrings(UpdateHostDb.HOSTDB_STRING_FIELDS);
percentiles = conf.getInts(UpdateHostDb.HOSTDB_PERCENTILES);
@@ -374,6 +376,14 @@ public class UpdateHostDbReducer
hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
}
+ // Impose limits on minimum number of URLs?
+ if (urlLimit > -1l) {
+ if (hostDatum.numRecords() < urlLimit) {
+ context.getCounter("UpdateHostDb", "url_limit_not_reached").increment(1);
+ return;
+ }
+ }
+
context.getCounter("UpdateHostDb", "total_hosts").increment(1);
// See if this record is to be checked