Author: jnioche
Date: Mon Dec 15 16:55:00 2014
New Revision: 1645697
URL: http://svn.apache.org/r1645697
Log:
NUTCH-1888 Specify HTMLMapper to use in TikaParser (Halil Simsek via jnioche)
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1645697&r1=1645696&r2=1645697&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Dec 15 16:55:00 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development 2.3-SNAPSHOT
+* NUTCH-1888 Specify HTMLMapper to use in TikaParser (Halil Simsek via jnioche)
+
* NUTCH-1897 Easier debugging of plugin XML errors (markus)
* NUTCH-1823 Upgrade to elasticsearch 1.4.1 (Phu Kieu, markus, lewismc)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1645697&r1=1645696&r2=1645697&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Mon Dec 15 16:55:00 2014
@@ -998,6 +998,16 @@
</description>
</property>
+<!--
+<property>
+ <name>tika.htmlmapper.classname</name>
+ <value>org.apache.tika.parser.html.IdentityHtmlMapper</value>
+ <description>Classname of Tika HTMLMapper to use. Influences the elements included in the DOM and hence
+ the behaviour of the HTMLParseFilters.
+ </description>
+</property>
+-->
+
<!-- urlfilter plugin properties -->
<property>
Modified:
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1645697&r1=1645696&r2=1645697&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Mon Dec 15 16:55:00 2014
@@ -17,6 +17,7 @@
package org.apache.nutch.parse.tika;
import org.apache.avro.util.Utf8;
+import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.metadata.Nutch;
@@ -32,6 +33,7 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
@@ -70,6 +72,8 @@ public class TikaParser implements org.a
private ParseFilters htmlParseFilters;
private String cachingPolicy;
+ private HtmlMapper HTMLMapper;
+
@Override
public Parse getParse(String url, WebPage page) {
@@ -93,8 +97,8 @@ public class TikaParser implements org.a
message, getConf());
}
- LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type "
- + mimeType);
+ LOG.debug("Using Tika parser " + parser.getClass().getName()
+ + " for mime-type " + mimeType);
Metadata tikamd = new Metadata();
@@ -103,14 +107,16 @@ public class TikaParser implements org.a
DocumentFragment root = doc.createDocumentFragment();
DOMBuilder domhandler = new DOMBuilder(doc, root);
ParseContext context = new ParseContext();
+ if (HTMLMapper != null)
+ context.set(HtmlMapper.class, HTMLMapper);
// to add once available in Tika
// context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
tikamd.set(Metadata.CONTENT_TYPE, mimeType);
try {
- parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(),
- raw.remaining()), domhandler, tikamd, context);
+ parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset()
+ + raw.position(), raw.remaining()), domhandler, tikamd, context);
} catch (Exception e) {
- LOG.error("Error parsing "+url,e);
+ LOG.error("Error parsing " + url, e);
return ParseStatusUtils.getEmptyParse(e, getConf());
}
@@ -160,10 +166,10 @@ public class TikaParser implements org.a
String[] TikaMDNames = tikamd.names();
for (String tikaMDName : TikaMDNames) {
if (tikaMDName.equalsIgnoreCase(TikaCoreProperties.TITLE.toString()))
- continue;
+ continue;
// TODO what if multivalued?
- page.getMetadata().put(new Utf8(tikaMDName), ByteBuffer.wrap(Bytes.toBytes(tikamd
- .get(tikaMDName))));
+ page.getMetadata().put(new Utf8(tikaMDName),
+ ByteBuffer.wrap(Bytes.toBytes(tikamd.get(tikaMDName))));
}
// no outlinks? try OutlinkExtractor e.g works for mime types where no
@@ -175,17 +181,18 @@ public class TikaParser implements org.a
ParseStatus status = ParseStatusUtils.STATUS_SUCCESS;
if (metaTags.getRefresh()) {
- status.setMinorCode((int)ParseStatusCodes.SUCCESS_REDIRECT);
+ status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT);
status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString()));
- status.getArgs().add(new Utf8(Integer.toString(metaTags.getRefreshTime())));
+ status.getArgs().add(
+ new Utf8(Integer.toString(metaTags.getRefreshTime())));
}
Parse parse = new Parse(text, title, outlinks, status);
parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
if (metaTags.getNoCache()) { // not okay to cache
- page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY), ByteBuffer.wrap(Bytes
- .toBytes(cachingPolicy)));
+ page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY),
+ ByteBuffer.wrap(Bytes.toBytes(cachingPolicy)));
}
return parse;
@@ -203,16 +210,35 @@ public class TikaParser implements org.a
throw new RuntimeException(e2);
}
+ // use a custom htmlmapper
+ String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
+ if (StringUtils.isNotBlank(htmlmapperClassName)) {
+ try {
+ Class HTMLMapperClass = Class.forName(htmlmapperClassName);
+ boolean interfaceOK = HtmlMapper.class
+ .isAssignableFrom(HTMLMapperClass);
+ if (!interfaceOK) {
+ throw new RuntimeException("Class " + htmlmapperClassName
+ + " does not implement HtmlMapper");
+ }
+ HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance();
+ } catch (Exception e) {
+ LOG.error("Can't generate instance for class " + htmlmapperClassName);
+ throw new RuntimeException("Can't generate instance for class "
+ + htmlmapperClassName);
+ }
+ }
+
this.htmlParseFilters = new ParseFilters(getConf());
this.utils = new DOMContentUtils(conf);
this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
Nutch.CACHING_FORBIDDEN_CONTENT);
}
- public TikaConfig getTikaConfig(){
- return this.tikaConfig;
+ public TikaConfig getTikaConfig() {
+ return this.tikaConfig;
}
-
+
public Configuration getConf() {
return this.conf;
}