This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 7e6eabb NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages 7e6eabb is described below commit 7e6eabbc2b0a0b5ee91148a9effc6447af5057ba Author: Markus Jelsma <mar...@apache.org> AuthorDate: Thu Apr 11 12:32:22 2019 +0200 NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages --- conf/nutch-default.xml | 9 +++++++++ .../org/apache/nutch/parse/tika/TikaParser.java | 22 ++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a4b202f..951494e 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1600,6 +1600,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this </description> </property> +<property> + <name>tika.extractor.boilerpipe.mime.types</name> + <value>text/html,application/xhtml+xml</value> + <description> + Comma-separated list of MIME types accepted for Boilerpipe extraction; + documents of other MIME types are not passed to the Boilerpipe extractor. 
+ </description> +</property> + <!-- urlfilter plugin properties --> <property> diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 7440333..40aa265 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -21,8 +21,11 @@ import java.io.ByteArrayInputStream; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -73,6 +76,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { private boolean upperCaseElementNames = true; private String boilerpipeExtractorName; private boolean useBoilerpipe; + private Set<String> boilerpipeMimeTypes; public ParseResult getParse(Content content) { HTMLDocumentImpl doc = new HTMLDocumentImpl(); @@ -114,7 +118,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { ContentHandler domHandler; // Check whether to use Tika's BoilerplateContentHandler - if (useBoilerpipe) { + if (useBoilerpipe && boilerpipeMimeTypes.contains(mimeType)) { BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler( (ContentHandler) new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); @@ -291,16 +295,18 @@ public class TikaParser implements org.apache.nutch.parse.Parser { } } - htmlParseFilters = new HtmlParseFilters(getConf()); + htmlParseFilters = new HtmlParseFilters(conf); utils = new DOMContentUtils(conf); - cachingPolicy = getConf().get("parser.caching.forbidden.policy", + cachingPolicy = conf.get("parser.caching.forbidden.policy", Nutch.CACHING_FORBIDDEN_CONTENT); - upperCaseElementNames = 
getConf().getBoolean("tika.uppercase.element.names", + upperCaseElementNames = conf.getBoolean("tika.uppercase.element.names", true); - useBoilerpipe = getConf().get("tika.extractor", "none") - .equals("boilerpipe"); - boilerpipeExtractorName = getConf() - .get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor"); + useBoilerpipe = conf.get("tika.extractor", "none").equals("boilerpipe"); + boilerpipeExtractorName = conf.get("tika.extractor.boilerpipe.algorithm", + "ArticleExtractor"); + boilerpipeMimeTypes = new HashSet<>(Arrays + .asList(conf.getTrimmedStrings("tika.extractor.boilerpipe.mime.types", + "text/html", "application/xhtml+xml"))); } public Configuration getConf() {