This is an automated email from the ASF dual-hosted git repository.
markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 7e6eabb NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages
7e6eabb is described below
commit 7e6eabbc2b0a0b5ee91148a9effc6447af5057ba
Author: Markus Jelsma <[email protected]>
AuthorDate: Thu Apr 11 12:32:22 2019 +0200
NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages
---
conf/nutch-default.xml | 9 +++++++++
.../org/apache/nutch/parse/tika/TikaParser.java | 22 ++++++++++++++--------
2 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index a4b202f..951494e 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1600,6 +1600,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
</description>
</property>
+<property>
+ <name>tika.extractor.boilerpipe.mime.types</name>
+ <value>text/html,application/xhtml+xml</value>
+ <description>
+ Comma-separated list of MIME types accepted for Boilerpipe extraction,
+ documents of other MIME types are not passed to the Boilerpipe extractor.
+ </description>
+</property>
+
<!-- urlfilter plugin properties -->
<property>
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 7440333..40aa265 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -21,8 +21,11 @@ import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
@@ -73,6 +76,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
private boolean upperCaseElementNames = true;
private String boilerpipeExtractorName;
private boolean useBoilerpipe;
+ private Set<String> boilerpipeMimeTypes;
public ParseResult getParse(Content content) {
HTMLDocumentImpl doc = new HTMLDocumentImpl();
@@ -114,7 +118,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
ContentHandler domHandler;
// Check whether to use Tika's BoilerplateContentHandler
- if (useBoilerpipe) {
+ if (useBoilerpipe && boilerpipeMimeTypes.contains(mimeType)) {
BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(
(ContentHandler) new DOMBuilder(doc, root),
BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
@@ -291,16 +295,18 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
}
}
- htmlParseFilters = new HtmlParseFilters(getConf());
+ htmlParseFilters = new HtmlParseFilters(conf);
utils = new DOMContentUtils(conf);
- cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+ cachingPolicy = conf.get("parser.caching.forbidden.policy",
Nutch.CACHING_FORBIDDEN_CONTENT);
- upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names",
+ upperCaseElementNames = conf.getBoolean("tika.uppercase.element.names",
true);
- useBoilerpipe = getConf().get("tika.extractor", "none")
- .equals("boilerpipe");
- boilerpipeExtractorName = getConf()
- .get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
+ useBoilerpipe = conf.get("tika.extractor", "none").equals("boilerpipe");
+ boilerpipeExtractorName = conf.get("tika.extractor.boilerpipe.algorithm",
+ "ArticleExtractor");
+ boilerpipeMimeTypes = new HashSet<>(Arrays
+ .asList(conf.getTrimmedStrings("tika.extractor.boilerpipe.mime.types",
+ "text/html", "application/xhtml+xml")));
}
public Configuration getConf() {