This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit ac383fc5125b6c114a23ef996558ead57e873970
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Wed Nov 8 12:24:24 2023 +0100

    [NUTCH-3017] Allow fast-urlfilter to load from HDFS/S3 and support gzipped input

    - use Hadoop-provided compression codecs
    - update description of property urlfilter.fast.file
---
 conf/nutch-default.xml                                  | 10 ++++++++--
 .../org/apache/nutch/urlfilter/fast/FastURLFilter.java  | 14 ++++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index d8bf76486..b20afdfe3 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1872,8 +1872,14 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
 <property>
   <name>urlfilter.fast.file</name>
   <value>fast-urlfilter.txt</value>
-  <description>Name of file on CLASSPATH containing regular expressions
-  used by urlfilter-fast (FastURLFilter) plugin.</description>
+  <description>Name of file containing rules and regular expressions
+  used by urlfilter-fast (FastURLFilter) plugin. If the filename
+  includes a scheme (for example, hdfs://), it is loaded using the
+  Hadoop FileSystem implementation supporting that scheme. If the
+  filename does not contain a scheme, the file is loaded from
+  CLASSPATH. If indicated by file extension (.gz, .bz2, .zst),
+  the file is decompressed while reading using Hadoop-provided
+  compression codecs.</description>
 </property>

 <property>
diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
index 79ad7b6ca..bb4a11b7c 100644
--- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -21,6 +21,8 @@ import com.google.common.collect.Multimap;
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.nutch.net.URLFilter;
 import org.slf4j.Logger;
@@ -35,7 +37,6 @@ import java.io.Reader;
 import java.net.URL;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
-import java.util.zip.GZIPInputStream;

 /**
  * Filters URLs based on a file of regular expressions using host/domains
@@ -120,7 +121,7 @@ public class FastURLFilter implements URLFilter {
     try {
       reloadRules();
     } catch (Exception e) {
-      LOG.error(e.getMessage());
+      LOG.error("Failed to load rules: {}", e.getMessage());
       throw new RuntimeException(e.getMessage(), e);
     }
   }
@@ -193,13 +194,14 @@ public class FastURLFilter implements URLFilter {
     if (fileRulesPath.toUri().getScheme() != null) {
       FileSystem fs = fileRulesPath.getFileSystem(conf);
       is = fs.open(fileRulesPath);
-    }
-    else {
+    } else {
      is = conf.getConfResourceAsInputStream(fileRules);
     }

-    if (fileRules.endsWith(".gz")) {
-      is = new GZIPInputStream(is);
+    CompressionCodec codec = new CompressionCodecFactory(conf)
+        .getCodec(fileRulesPath);
+    if (codec != null) {
+      is = codec.createInputStream(is);
     }

     reloadRules(new InputStreamReader(is));
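
For illustration, here is a minimal standalone sketch (not part of the
commit) of the loading path introduced above. The class and method names
(RuleFileLoaderSketch, openRuleFile) are hypothetical; the local variable
names and Hadoop calls mirror FastURLFilter.reloadRules() as changed by
this commit:

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class RuleFileLoaderSketch {

  /** Opens the rule file named by fileRules, resolving scheme and codec. */
  public static Reader openRuleFile(Configuration conf, String fileRules)
      throws Exception {
    Path fileRulesPath = new Path(fileRules);
    InputStream is;
    if (fileRulesPath.toUri().getScheme() != null) {
      // Scheme present (hdfs://, s3a://, file://, ...): open the file via
      // the Hadoop FileSystem implementation registered for that scheme.
      FileSystem fs = fileRulesPath.getFileSystem(conf);
      is = fs.open(fileRulesPath);
    } else {
      // No scheme: load the file from the CLASSPATH, as before.
      is = conf.getConfResourceAsInputStream(fileRules);
    }
    // CompressionCodecFactory picks a codec by filename suffix
    // (.gz -> GzipCodec, .bz2 -> BZip2Codec, .zst -> ZStandardCodec) and
    // returns null for unknown suffixes, so plain files pass through.
    CompressionCodec codec = new CompressionCodecFactory(conf)
        .getCodec(fileRulesPath);
    if (codec != null) {
      is = codec.createInputStream(is);
    }
    return new InputStreamReader(is);
  }
}

With this in place, urlfilter.fast.file can point at, say,
hdfs://namenode/filters/fast-urlfilter.txt.gz (a hypothetical path): the
HDFS FileSystem opens the stream, and GzipCodec, selected by the .gz
extension, decompresses it on the fly.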
commit ac383fc5125b6c114a23ef996558ead57e873970 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Wed Nov 8 12:24:24 2023 +0100 [NUTCH-3017] Allow fast-urlfilter to load from HDFS/S3 and support gzipped input - use Hadoop-provided compression codecs - update description of property urlfilter.fast.file --- conf/nutch-default.xml | 10 ++++++++-- .../org/apache/nutch/urlfilter/fast/FastURLFilter.java | 14 ++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index d8bf76486..b20afdfe3 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1872,8 +1872,14 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this <property> <name>urlfilter.fast.file</name> <value>fast-urlfilter.txt</value> - <description>Name of file on CLASSPATH containing regular expressions - used by urlfilter-fast (FastURLFilter) plugin.</description> + <description>Name of file containing rules and regular expressions + used by urlfilter-fast (FastURLFilter) plugin. If the filename + includes a scheme (for example, hdfs://) it is loaded using the + Hadoop FileSystem implementation supporting that scheme. If the + filename does not contain a scheme, the file is loaded from + CLASSPATH. If indicated by file extension (.gz, .bzip2, .zst), + the file is decompressed while reading using Hadoop-provided + compression codecs.</description> </property> <property> diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java index 79ad7b6ca..bb4a11b7c 100644 --- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java +++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java @@ -21,6 +21,8 @@ import com.google.common.collect.Multimap; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.nutch.net.URLFilter; import org.slf4j.Logger; @@ -35,7 +37,6 @@ import java.io.Reader; import java.net.URL; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import java.util.zip.GZIPInputStream; /** * Filters URLs based on a file of regular expressions using host/domains @@ -120,7 +121,7 @@ public class FastURLFilter implements URLFilter { try { reloadRules(); } catch (Exception e) { - LOG.error(e.getMessage()); + LOG.error("Failed to load rules: {}", e.getMessage() ); throw new RuntimeException(e.getMessage(), e); } } @@ -193,13 +194,14 @@ public class FastURLFilter implements URLFilter { if (fileRulesPath.toUri().getScheme() != null) { FileSystem fs = fileRulesPath.getFileSystem(conf); is = fs.open(fileRulesPath); - } - else { + } else { is = conf.getConfResourceAsInputStream(fileRules); } - if (fileRules.endsWith(".gz")) { - is = new GZIPInputStream(is); + CompressionCodec codec = new CompressionCodecFactory(conf) + .getCodec(fileRulesPath); + if (codec != null) { + is = codec.createInputStream(is); } reloadRules(new InputStreamReader(is));