This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit ac383fc5125b6c114a23ef996558ead57e873970
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Wed Nov 8 12:24:24 2023 +0100

    [NUTCH-3017] Allow fast-urlfilter to load from HDFS/S3 and support gzipped input

    - use Hadoop-provided compression codecs
    - update description of property urlfilter.fast.file
---
 conf/nutch-default.xml                                  | 10 ++++++++--
 .../org/apache/nutch/urlfilter/fast/FastURLFilter.java  | 14 ++++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index d8bf76486..b20afdfe3 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1872,8 +1872,14 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
 <property>
   <name>urlfilter.fast.file</name>
   <value>fast-urlfilter.txt</value>
-  <description>Name of file on CLASSPATH containing regular expressions
-  used by urlfilter-fast (FastURLFilter) plugin.</description>
+  <description>Name of file containing rules and regular expressions
+  used by urlfilter-fast (FastURLFilter) plugin. If the filename
+  includes a scheme (for example, hdfs://), it is loaded using the
+  Hadoop FileSystem implementation supporting that scheme. If the
+  filename does not contain a scheme, the file is loaded from
+  CLASSPATH. If indicated by file extension (.gz, .bz2, .zst),
+  the file is decompressed while reading using Hadoop-provided
+  compression codecs.</description>
 </property>

 <property>
diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
index 79ad7b6ca..bb4a11b7c 100644
--- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -21,6 +21,8 @@ import com.google.common.collect.Multimap;
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.nutch.net.URLFilter;
 import org.slf4j.Logger;
@@ -35,7 +37,6 @@ import java.io.Reader;
 import java.net.URL;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
-import java.util.zip.GZIPInputStream;

 /**
  * Filters URLs based on a file of regular expressions using host/domains
@@ -120,7 +121,7 @@ public class FastURLFilter implements URLFilter {
     try {
       reloadRules();
     } catch (Exception e) {
-      LOG.error(e.getMessage());
+      LOG.error("Failed to load rules: {}", e.getMessage());
       throw new RuntimeException(e.getMessage(), e);
     }
   }
@@ -193,13 +194,14 @@ public class FastURLFilter implements URLFilter {
     if (fileRulesPath.toUri().getScheme() != null) {
       FileSystem fs = fileRulesPath.getFileSystem(conf);
       is = fs.open(fileRulesPath);
-    }
-    else {
+    } else {
      is = conf.getConfResourceAsInputStream(fileRules);
     }

-    if (fileRules.endsWith(".gz")) {
-      is = new GZIPInputStream(is);
+    CompressionCodec codec = new CompressionCodecFactory(conf)
+        .getCodec(fileRulesPath);
+    if (codec != null) {
+      is = codec.createInputStream(is);
     }

     reloadRules(new InputStreamReader(is));
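
For illustration, here is a minimal standalone sketch (not part of the
commit) of the loading path introduced above. The class and method names
(RuleFileLoaderSketch, openRuleFile) are hypothetical; the local variable
names and Hadoop calls mirror FastURLFilter.reloadRules() as changed by
this commit:

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class RuleFileLoaderSketch {

  /** Opens the rule file named by fileRules, resolving scheme and codec. */
  public static Reader openRuleFile(Configuration conf, String fileRules)
      throws Exception {
    Path fileRulesPath = new Path(fileRules);
    InputStream is;
    if (fileRulesPath.toUri().getScheme() != null) {
      // Scheme present (hdfs://, s3a://, file://, ...): open the file via
      // the Hadoop FileSystem implementation registered for that scheme.
      FileSystem fs = fileRulesPath.getFileSystem(conf);
      is = fs.open(fileRulesPath);
    } else {
      // No scheme: load the file from the CLASSPATH, as before.
      is = conf.getConfResourceAsInputStream(fileRules);
    }
    // CompressionCodecFactory picks a codec by filename suffix
    // (.gz -> GzipCodec, .bz2 -> BZip2Codec, .zst -> ZStandardCodec) and
    // returns null for unknown suffixes, so plain files pass through.
    CompressionCodec codec = new CompressionCodecFactory(conf)
        .getCodec(fileRulesPath);
    if (codec != null) {
      is = codec.createInputStream(is);
    }
    return new InputStreamReader(is);
  }
}

With this in place, urlfilter.fast.file can point at, say,
hdfs://namenode/filters/fast-urlfilter.txt.gz (a hypothetical path): the
HDFS FileSystem opens the stream, and GzipCodec, selected by the .gz
extension, decompresses it on the fly.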
commit ac383fc5125b6c114a23ef996558ead57e873970 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Wed Nov 8 12:24:24 2023 +0100 [NUTCH-3017] Allow fast-urlfilter to load from HDFS/S3 and support gzipped input - use Hadoop-provided compression codecs - update description of property urlfilter.fast.file --- conf/nutch-default.xml | 10 ++++++++-- .../org/apache/nutch/urlfilter/fast/FastURLFilter.java | 14 ++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index d8bf76486..b20afdfe3 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1872,8 +1872,14 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this <property> <name>urlfilter.fast.file</name> <value>fast-urlfilter.txt</value> - <description>Name of file on CLASSPATH containing regular expressions - used by urlfilter-fast (FastURLFilter) plugin.</description> + <description>Name of file containing rules and regular expressions + used by urlfilter-fast (FastURLFilter) plugin. If the filename + includes a scheme (for example, hdfs://) it is loaded using the + Hadoop FileSystem implementation supporting that scheme. If the + filename does not contain a scheme, the file is loaded from + CLASSPATH. If indicated by file extension (.gz, .bzip2, .zst), + the file is decompressed while reading using Hadoop-provided + compression codecs.</description> </property> <property> diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java index 79ad7b6ca..bb4a11b7c 100644 --- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java +++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java @@ -21,6 +21,8 @@ import com.google.common.collect.Multimap; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.nutch.net.URLFilter; import org.slf4j.Logger; @@ -35,7 +37,6 @@ import java.io.Reader; import java.net.URL; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import java.util.zip.GZIPInputStream; /** * Filters URLs based on a file of regular expressions using host/domains @@ -120,7 +121,7 @@ public class FastURLFilter implements URLFilter { try { reloadRules(); } catch (Exception e) { - LOG.error(e.getMessage()); + LOG.error("Failed to load rules: {}", e.getMessage() ); throw new RuntimeException(e.getMessage(), e); } } @@ -193,13 +194,14 @@ public class FastURLFilter implements URLFilter { if (fileRulesPath.toUri().getScheme() != null) { FileSystem fs = fileRulesPath.getFileSystem(conf); is = fs.open(fileRulesPath); - } - else { + } else { is = conf.getConfResourceAsInputStream(fileRules); } - if (fileRules.endsWith(".gz")) { - is = new GZIPInputStream(is); + CompressionCodec codec = new CompressionCodecFactory(conf) + .getCodec(fileRulesPath); + if (codec != null) { + is = codec.createInputStream(is); } reloadRules(new InputStreamReader(is));