This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new eae3c52a8 NUTCH-2993 ScoringDepth plugin to skip depth check based on URL Pattern - apply patch contributed by Markus Jelsma eae3c52a8 is described below commit eae3c52a8140344dff46c448664a2467d631cefc Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Thu Jul 20 13:44:26 2023 +0200 NUTCH-2993 ScoringDepth plugin to skip depth check based on URL Pattern - apply patch contributed by Markus Jelsma --- conf/nutch-default.xml | 16 ++++++++++++++ .../nutch/scoring/depth/DepthScoringFilter.java | 25 ++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 273cfccc5..379b5ef5d 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1918,6 +1918,22 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this </description> </property> +<property> + <name>scoring.depth.override.pattern</name> + <value></value> + <description>URLs matching this pattern pass a different max depth value, + configured in scoring.depth.max.override, to their outlinks. + </description> +</property> + +<property> + <name>scoring.depth.max.override</name> + <value></value> + <description>This max depth value is passed to the outlinks of URLs matching the pattern + configured in scoring.depth.override.pattern. + </description> +</property> + <!-- scoring similarity properties Add scoring-similarity to the list of active plugins in the parameter 'plugin.includes' in order to use it. 
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java index e6aa7a642..6fdf9edd6 100644 --- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java +++ b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java @@ -21,6 +21,8 @@ import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.Map.Entry; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -59,6 +61,8 @@ public class DepthScoringFilter extends Configured implements ScoringFilter { public static final int DEFAULT_MAX_DEPTH = 1000; private int defaultMaxDepth; + private Pattern depthOverridePattern = null; + private int maxDepthOverride = -1; @Override public void setConf(Configuration conf) { @@ -69,6 +73,16 @@ public class DepthScoringFilter extends Configured implements ScoringFilter { if (defaultMaxDepth <= 0) { defaultMaxDepth = DEFAULT_MAX_DEPTH; } + String depthOverrideStr = conf.get("scoring.depth.override.pattern"); + if (depthOverrideStr != null && !depthOverrideStr.isEmpty()) { + try { + depthOverridePattern = Pattern.compile(depthOverrideStr); + maxDepthOverride = conf.getInt("scoring.depth.max.override", 10); + } catch (Exception e) { + LOG.warn("Unable to compile scoring.depth.override.pattern because: {}", + e.getMessage(), e); + } + } } @Override @@ -93,6 +107,17 @@ public class DepthScoringFilter extends Configured implements ScoringFilter { curMaxDepth = Integer.parseInt(maxDepthString); customMaxDepth = new IntWritable(curMaxDepth); } + // If URL matches the pattern, we'll override maxDepth + if (depthOverridePattern != null) { + Matcher matcher = depthOverridePattern.matcher(fromUrl.toString()); + if (matcher.find()) { + curMaxDepth = maxDepthOverride; + customMaxDepth = new 
IntWritable(maxDepthOverride); + } else { + curMaxDepth = defaultMaxDepth; + customMaxDepth = new IntWritable(curMaxDepth); + } + } if (curDepth >= curMaxDepth) { // depth exceeded - throw away LOG.info("Depth limit (" + curMaxDepth