(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler

2024-03-13 Thread markus
This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
 new a8ec17ca8 NUTCH-3029 Host specific max. and min. intervals in adaptive 
scheduler
a8ec17ca8 is described below

commit a8ec17ca853b2488bf5d96538915a00a05064a31
Author: Markus Jelsma 
AuthorDate: Wed Mar 13 18:35:22 2024 +

NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
---
 src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java 
b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
index 4d4a3af73..64719cdae 100644
--- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -192,6 +192,7 @@ public class AdaptiveFetchSchedule extends 
AbstractFetchSchedule {
*
* @param url url to get hostname for
* @return hostname
+   * @throws URISyntaxException
*/
   public static String getHostName(String url) throws URISyntaxException {
 URI uri = new URI(url);



(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler

2024-03-13 Thread markus
This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
 new 84cda2abd NUTCH-3029 Host specific max. and min. intervals in adaptive 
scheduler
84cda2abd is described below

commit 84cda2abd500667222fdb00e503780ee0bdaaab4
Author: Markus Jelsma 
AuthorDate: Wed Mar 13 16:12:21 2024 +

NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
---
 .../org/apache/nutch/crawl/AdaptiveFetchSchedule.java   | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java 
b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
index a403d5649..4d4a3af73 100644
--- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -189,6 +189,9 @@ public class AdaptiveFetchSchedule extends 
AbstractFetchSchedule {
 
   /**
* Strip a URL, leaving only the host name.
+   *
+   * @param url url to get hostname for
+   * @return hostname
*/
   public static String getHostName(String url) throws URISyntaxException {
 URI uri = new URI(url);
@@ -198,9 +201,10 @@ public class AdaptiveFetchSchedule extends 
AbstractFetchSchedule {
 
   /**
* Returns the max_interval for this URL, which might depend on the host.
-   * @param  url  the URL to be scheduled
-   * @param  defaultMaxInterval  the value to which to default
-   * if max_interval has not been configured for this host
+   *
+   * @param url the URL to be scheduled
+   * @param defaultMaxInterval the value to which to default if max_interval 
has not been configured for this host
+   * @return the configured maximum interval or the default interval
*/
   public float getMaxInterval(Text url, float defaultMaxInterval){
 if (hostSpecificMaxInterval.isEmpty()) {
@@ -220,9 +224,10 @@ public class AdaptiveFetchSchedule extends 
AbstractFetchSchedule {
 
   /**
* Returns the min_interval for this URL, which might depend on the host.
-   * @param  url  the URL to be scheduled
-   * @param  defaultMinInterval  the value to which to default
-   * if min_interval has not been configured for this host
+   *
+   * @param url the URL to be scheduled
+   * @param defaultMinInterval the value to which to default if min_interval 
has not been configured for this host
+   * @return the configured minimum interval or the default interval
*/
   public float getMinInterval(Text url, float defaultMinInterval){
 if (hostSpecificMinInterval.isEmpty()) {



(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler

2024-03-13 Thread markus
This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
 new 5ba50c0c6 NUTCH-3029 Host specific max. and min. intervals in adaptive 
scheduler
5ba50c0c6 is described below

commit 5ba50c0c6091a95818d3788f0d5b7c0ff49bec57
Author: Markus Jelsma 
AuthorDate: Wed Mar 13 14:53:10 2024 +

NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
---
 .../apache/nutch/crawl/AdaptiveFetchSchedule.java  | 159 -
 1 file changed, 155 insertions(+), 4 deletions(-)

diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java 
b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
index 5bccd4f30..a403d5649 100644
--- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -22,11 +22,20 @@ import org.apache.hadoop.io.FloatWritable;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.commons.lang.StringUtils;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.Map;
+import java.util.HashMap;
 import java.lang.invoke.MethodHandles;
+import java.net.URI;
+import java.net.URISyntaxException;
 
 /**
  * This class implements an adaptive re-fetch algorithm. This works as follows:
@@ -79,9 +88,16 @@ public class AdaptiveFetchSchedule extends 
AbstractFetchSchedule {
 
   private double SYNC_DELTA_RATE;
 
+  private Configuration conf;
+
+  private Map<String, Float> hostSpecificMaxInterval = new HashMap<>();
+  
+  private Map<String, Float> hostSpecificMinInterval = new HashMap<>();
+
   @Override
   public void setConf(Configuration conf) {
 super.setConf(conf);
+this.conf = conf;
 if (conf == null)
   return;
 INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
@@ -92,6 +108,136 @@ public class AdaptiveFetchSchedule extends 
AbstractFetchSchedule {
 SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", 
true);
 SYNC_DELTA_RATE = conf.getFloat(
 "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
+try {
+  setHostSpecificIntervals("adaptive-host-specific-intervals.txt", 
+MIN_INTERVAL, MAX_INTERVAL);
+} catch (IOException e){
+  LOG.error("Failed reading the configuration file. ", e);
+}
+  }
+
+  /**
+   * Load host-specific min_intervals and max_intervals
+   * from the configuration file into the HashMaps.
+   */
+  private void setHostSpecificIntervals(String fileName,
+float defaultMin, float defaultMax) throws IOException {
+Reader configReader = null;
+configReader = conf.getConfResourceAsReader(fileName);
+if (configReader == null) {
+  configReader = new FileReader(fileName);
+}
+BufferedReader reader = new BufferedReader(configReader);
+String line;
+int lineNo = 0;
+while ((line = reader.readLine()) != null) {
+  lineNo++;
+  if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+line = line.trim();
+String[] parts = line.split("\\s+");
+if (parts.length == 3) {
+  // TODO: Maybe add host validatio here?
+  // It might get computationally expensive for large files, though.
+  String host = parts[0].trim().toLowerCase();
+  String minInt = parts[1].trim();
+  String maxInt = parts[2].trim();
+  if (minInt.equalsIgnoreCase("default")){ minInt = "0"; }
+  if (maxInt.equalsIgnoreCase("default")){ maxInt = "0"; }
+  float m,M;
+  try {
+m = Float.parseFloat(minInt);
+M = Float.parseFloat(maxInt);
+
+//negative values and mismatched boundaries are ignored
+//(default to global settings)
+if (m < 0 || M < 0 || m > M){
+  LOG.error("Improper fetch intervals given on line " + 
String.valueOf(lineNo)
++ " in the config. file: " + line);
+} else {
+
+  // min. interval should be positive and above the global minimum
+  if (m > 0 && m > defaultMin){
+  hostSpecificMinInterval.put(host,m);
+  LOG.debug("Added custom min. interval " + m + " for host " + 
host + ".");
+  } else if (m > 0) {
+LOG.error("Min. interval out of bounds on line " + 
String.valueOf(lineNo)
+  + " in the config. file: " + line);
+  }
+
+  // max. interval should be positive and below the global maximum
+  if (M > 0 && M < defaultMax){
+hostSpecificMaxInterval.put(host,M);
+LOG.debug("Added custom max. interval " + M + " 

(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler

2024-03-13 Thread markus
This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
 new 4642c30c2 NUTCH-3029 Host specific max. and min. intervals in adaptive 
scheduler
4642c30c2 is described below

commit 4642c30c2aeb2a1fa2436541bd4af877d0aad86a
Author: Markus Jelsma 
AuthorDate: Wed Mar 13 12:58:05 2024 +

NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
---
 conf/adaptive-host-specific-intervals.txt.template | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/conf/adaptive-host-specific-intervals.txt.template 
b/conf/adaptive-host-specific-intervals.txt.template
new file mode 100644
index 000000000..4aa7920d3
--- /dev/null
+++ b/conf/adaptive-host-specific-intervals.txt.template
@@ -0,0 +1,14 @@
+# This file defines a mapping that associates specific min. and max. 
refetching time intervals
+# to a host, that deviate from the default settings of the 
AdaptiveFetchSchedule class.
+#
+# Format: <hostname> <min_interval> <max_interval>.
+#
+# The two values will be parsed as float and should be STRICTLY between
+# db.fetch.schedule.adaptive.min_interval and 
db.fetch.schedule.adaptive.max_interval.
+#
+# To use default values, write "default" or "0".
+# The default min. is 60 (1 min) and default max. is 31536000 (1 year).
+#
+www.apache.org   default 1728000
+www.example.org  1296000 0
+nutch.apache.org 864000  2160000