This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new eae3c52a8 NUTCH-2993 ScoringDepth plugin to skip depth check based on URL Pattern - apply patch contributed by Markus Jelsma
eae3c52a8 is described below

commit eae3c52a8140344dff46c448664a2467d631cefc
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Thu Jul 20 13:44:26 2023 +0200

    NUTCH-2993 ScoringDepth plugin to skip depth check based on URL Pattern
    - apply patch contributed by Markus Jelsma
---
 conf/nutch-default.xml                             | 16 ++++++++++++++
 .../nutch/scoring/depth/DepthScoringFilter.java    | 25 ++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 273cfccc5..379b5ef5d 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1918,6 +1918,22 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
   </description>
 </property>
 
+<property>
+  <name>scoring.depth.override.pattern</name>
+  <value></value>
+  <description>URLs matching this pattern pass a different max depth value
+  to their outlinks configured in scoring.depth.max.override.
+  </description>
+</property>
+
+<property>
+  <name>scoring.depth.max.override</name>
+  <value></value>
+  <description>This max depth value is passed to the outlinks of URLs matching
+  the pattern configured in scoring.depth.override.pattern.
+  </description>
+</property>
+
 <!-- scoring similarity properties
  Add scoring-similarity to the list of active plugins
  in the parameter 'plugin.includes' in order to use it.
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
index e6aa7a642..6fdf9edd6 100644
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
+++ b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
@@ -21,6 +21,8 @@ import java.util.Collection;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map.Entry;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -59,6 +61,8 @@ public class DepthScoringFilter extends Configured implements ScoringFilter {
   public static final int DEFAULT_MAX_DEPTH = 1000;
 
   private int defaultMaxDepth;
+  private Pattern depthOverridePattern = null;
+  private int maxDepthOverride = -1;
 
   @Override
   public void setConf(Configuration conf) {
@@ -69,6 +73,16 @@ public class DepthScoringFilter extends Configured implements ScoringFilter {
     if (defaultMaxDepth <= 0) {
       defaultMaxDepth = DEFAULT_MAX_DEPTH;
     }
+    String depthOverrideStr = conf.get("scoring.depth.override.pattern");
+    if (depthOverrideStr != null && !depthOverrideStr.isEmpty()) {
+      try {
+        depthOverridePattern = Pattern.compile(depthOverrideStr);
+        maxDepthOverride = conf.getInt("scoring.depth.max.override", 10);
+      } catch (Exception e) {
+        LOG.warn("Unable to compile scoring.depth.override.pattern because: {}",
+            e.getMessage(), e);
+      }
+    }
   }
 
   @Override
@@ -93,6 +107,17 @@ public class DepthScoringFilter extends Configured implements ScoringFilter {
       curMaxDepth = Integer.parseInt(maxDepthString);
       customMaxDepth = new IntWritable(curMaxDepth);
     }
+    // If URL matches the pattern, we'll override maxDepth
+    if (depthOverridePattern != null) {
+      Matcher matcher = depthOverridePattern.matcher(fromUrl.toString());
+      if (matcher.find()) {
+        curMaxDepth = maxDepthOverride;
+        customMaxDepth = new IntWritable(maxDepthOverride);
+      } else {
+        curMaxDepth = defaultMaxDepth;
+        customMaxDepth = new IntWritable(curMaxDepth);
+      }
+    }
     if (curDepth >= curMaxDepth) {
       // depth exceeded - throw away
       LOG.info("Depth limit (" + curMaxDepth

Reply via email to