[nutch] branch master updated: NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages

markus Thu, 11 Apr 2019 03:34:23 -0700

This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git



The following commit(s) were added to refs/heads/master by this push:
     new 7e6eabb  NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages
7e6eabb is described below

commit 7e6eabbc2b0a0b5ee91148a9effc6447af5057ba
Author: Markus Jelsma <[email protected]>
AuthorDate: Thu Apr 11 12:32:22 2019 +0200

    NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages
---
 conf/nutch-default.xml                             |  9 +++++++++
 .../org/apache/nutch/parse/tika/TikaParser.java    | 22 ++++++++++++++--------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index a4b202f..951494e 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1600,6 +1600,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
   </description>
 </property>
 
+<property>
+  <name>tika.extractor.boilerpipe.mime.types</name>
+  <value>text/html,application/xhtml+xml</value>
+  <description>
+    Comma-separated list of MIME types accepted for Boilerpipe extraction,
+    documents of other MIME types are not passed to the Boilerpipe extractor.
+  </description>
+</property>
+
 <!-- urlfilter plugin properties -->
 
 <property>
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 7440333..40aa265 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -21,8 +21,11 @@ import java.io.ByteArrayInputStream;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
@@ -73,6 +76,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
   private boolean upperCaseElementNames = true;
   private String boilerpipeExtractorName;
   private boolean useBoilerpipe;
+  private Set<String> boilerpipeMimeTypes;
 
   public ParseResult getParse(Content content) {
     HTMLDocumentImpl doc = new HTMLDocumentImpl();
@@ -114,7 +118,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
     ContentHandler domHandler;
 
     // Check whether to use Tika's BoilerplateContentHandler
-    if (useBoilerpipe) {
+    if (useBoilerpipe && boilerpipeMimeTypes.contains(mimeType)) {
       BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(
           (ContentHandler) new DOMBuilder(doc, root),
           BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
@@ -291,16 +295,18 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
       }
     }
 
-    htmlParseFilters = new HtmlParseFilters(getConf());
+    htmlParseFilters = new HtmlParseFilters(conf);
     utils = new DOMContentUtils(conf);
-    cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+    cachingPolicy = conf.get("parser.caching.forbidden.policy",
         Nutch.CACHING_FORBIDDEN_CONTENT);
-    upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names",
+    upperCaseElementNames = conf.getBoolean("tika.uppercase.element.names",
         true);
-    useBoilerpipe = getConf().get("tika.extractor", "none")
-        .equals("boilerpipe");
-    boilerpipeExtractorName = getConf()
-        .get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
+    useBoilerpipe = conf.get("tika.extractor", "none").equals("boilerpipe");
+    boilerpipeExtractorName = conf.get("tika.extractor.boilerpipe.algorithm",
+        "ArticleExtractor");
+    boilerpipeMimeTypes = new HashSet<>(Arrays
+        .asList(conf.getTrimmedStrings("tika.extractor.boilerpipe.mime.types",
+            "text/html", "application/xhtml+xml")));
   }
 
   public Configuration getConf() {

[nutch] branch master updated: NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages

Reply via email to