(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new a8ec17ca8 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler a8ec17ca8 is described below commit a8ec17ca853b2488bf5d96538915a00a05064a31 Author: Markus Jelsma AuthorDate: Wed Mar 13 18:35:22 2024 +0000 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 4d4a3af73..64719cdae 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -192,6 +192,7 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { * * @param url url to get hostname for * @return hostname + * @throws URISyntaxException */ public static String getHostName(String url) throws URISyntaxException { URI uri = new URI(url);
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 84cda2abd NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler 84cda2abd is described below commit 84cda2abd500667222fdb00e503780ee0bdaaab4 Author: Markus Jelsma AuthorDate: Wed Mar 13 16:12:21 2024 +0000 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- .../org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 17 +++-- 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index a403d5649..4d4a3af73 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -189,6 +189,9 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { /** * Strip a URL, leaving only the host name. + * + * @param url url to get hostname for + * @return hostname */ public static String getHostName(String url) throws URISyntaxException { URI uri = new URI(url); @@ -198,9 +201,10 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { /** * Returns the max_interval for this URL, which might depend on the host. 
- * @param url the URL to be scheduled - * @param defaultMaxInterval the value to which to default - * if max_interval has not been configured for this host + * + * @param url the URL to be scheduled + * @param defaultMaxInterval the value to which to default if max_interval has not been configured for this host + * @return the configured maximum interval or the default interval */ public float getMaxInterval(Text url, float defaultMaxInterval){ if (hostSpecificMaxInterval.isEmpty()) { @@ -220,9 +224,10 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { /** * Returns the min_interval for this URL, which might depend on the host. - * @param url the URL to be scheduled - * @param defaultMinInterval the value to which to default - * if min_interval has not been configured for this host + * + * @param url the URL to be scheduled + * @param defaultMinInterval the value to which to default if min_interval has not been configured for this host + * @return the configured minimum interval or the default interval */ public float getMinInterval(Text url, float defaultMinInterval){ if (hostSpecificMinInterval.isEmpty()) {
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 5ba50c0c6 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler 5ba50c0c6 is described below commit 5ba50c0c6091a95818d3788f0d5b7c0ff49bec57 Author: Markus Jelsma AuthorDate: Wed Mar 13 14:53:10 2024 +0000 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- .../apache/nutch/crawl/AdaptiveFetchSchedule.java | 159 - 1 file changed, 155 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 5bccd4f30..a403d5649 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -22,11 +22,20 @@ import org.apache.hadoop.io.FloatWritable; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.util.NutchConfiguration; +import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.Reader; +import java.io.FileReader; +import java.io.BufferedReader; +import java.io.IOException; +import java.util.Map; +import java.util.HashMap; import java.lang.invoke.MethodHandles; +import java.net.URI; +import java.net.URISyntaxException; /** * This class implements an adaptive re-fetch algorithm. 
This works as follows: @@ -79,9 +88,16 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { private double SYNC_DELTA_RATE; + private Configuration conf; + + private Map<String,Float> hostSpecificMaxInterval = new HashMap<>(); + + private Map<String,Float> hostSpecificMinInterval = new HashMap<>(); + @Override public void setConf(Configuration conf) { super.setConf(conf); +this.conf = conf; if (conf == null) return; INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f); @@ -92,6 +108,136 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true); SYNC_DELTA_RATE = conf.getFloat( "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f); +try { + setHostSpecificIntervals("adaptive-host-specific-intervals.txt", +MIN_INTERVAL, MAX_INTERVAL); +} catch (IOException e){ + LOG.error("Failed reading the configuration file. ", e); +} + } + + /** + * Load host-specific min_intervals and max_intervals + * from the configuration file into the HashMaps. + */ + private void setHostSpecificIntervals(String fileName, +float defaultMin, float defaultMax) throws IOException { +Reader configReader = null; +configReader = conf.getConfResourceAsReader(fileName); +if (configReader == null) { + configReader = new FileReader(fileName); +} +BufferedReader reader = new BufferedReader(configReader); +String line; +int lineNo = 0; +while ((line = reader.readLine()) != null) { + lineNo++; + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { +line = line.trim(); +String[] parts = line.split("\\s+"); +if (parts.length == 3) { + // TODO: Maybe add host validation here? + // It might get computationally expensive for large files, though. 
+ String host = parts[0].trim().toLowerCase(); + String minInt = parts[1].trim(); + String maxInt = parts[2].trim(); + if (minInt.equalsIgnoreCase("default")){ minInt = "0"; } + if (maxInt.equalsIgnoreCase("default")){ maxInt = "0"; } + float m,M; + try { +m = Float.parseFloat(minInt); +M = Float.parseFloat(maxInt); + +//negative values and mismatched boundaries are ignored +//(default to global settings) +if (m < 0 || M < 0 || m > M){ + LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo) ++ " in the config. file: " + line); +} else { + + // min. interval should be positive and above the global minimum + if (m > 0 && m > defaultMin){ + hostSpecificMinInterval.put(host,m); + LOG.debug("Added custom min. interval " + m + " for host " + host + "."); + } else if (m > 0) { +LOG.error("Min. interval out of bounds on line " + String.valueOf(lineNo) + + " in the config. file: " + line); + } + + // max. interval should be positive and below the global maximum + if (M > 0 && M < defaultMax){ +hostSpecificMaxInterval.put(host,M); +LOG.debug("Added custom max. interval " + M + "
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 4642c30c2 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler 4642c30c2 is described below commit 4642c30c2aeb2a1fa2436541bd4af877d0aad86a Author: Markus Jelsma AuthorDate: Wed Mar 13 12:58:05 2024 +0000 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- conf/adaptive-host-specific-intervals.txt.template | 14 ++ 1 file changed, 14 insertions(+) diff --git a/conf/adaptive-host-specific-intervals.txt.template b/conf/adaptive-host-specific-intervals.txt.template new file mode 100644 index 000000000..4aa7920d3 --- /dev/null +++ b/conf/adaptive-host-specific-intervals.txt.template @@ -0,0 +1,14 @@ +# This file defines a mapping that associates specific min. and max. refetching time intervals +# to a host, that deviate from the default settings of the AdaptiveFetchSchedule class. +# +# Format: <hostname> <min_interval> <max_interval>. +# +# The two values will be parsed as float and should be STRICTLY between +# db.fetch.schedule.adaptive.min_interval and db.fetch.schedule.adaptive.max_interval. +# +# To use default values, write "default" or "0". +# The default min. is 60 (1 min) and default max. is 31536000 (1 year). +# +www.apache.org default 1728000 +www.example.org 1296000 0 +nutch.apache.org 864000 2160000