Author: jnioche
Date: Fri Nov  7 09:59:06 2014
New Revision: 1637325

URL: http://svn.apache.org/r1637325
Log:
NUTCH-1887 Specify HTMLMapper to use in TikaParser

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1637325&r1=1637324&r2=1637325&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Nov  7 09:59:06 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1887 Specify HTMLMapper to use in TikaParser (jnioche)
+
 * NUTCH-1884 NullPointerException in parsechecker and indexchecker with symlinks in file URL (Mengying Wang, snagel)
 
 * NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1637325&r1=1637324&r2=1637325&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Nov  7 09:59:06 2014
@@ -1190,6 +1190,16 @@
   </description>
 </property>
 
+<!--
+<property>
+  <name>tika.htmlmapper.classname</name>
+  <value>org.apache.tika.parser.html.IdentityHtmlMapper</value>
+  <description>Classname of Tika HTMLMapper to use. Influences the elements included in the DOM and hence
+  the behaviour of the HTMLParseFilters.
+  </description>
+</property>
+-->
+
 <!-- urlfilter plugin properties -->
 
 <property>

Modified: 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1637325&r1=1637324&r2=1637325&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
 Fri Nov  7 09:59:06 2014
@@ -22,6 +22,7 @@ import java.net.URL;
 import java.util.ArrayList;
 import java.util.Map;
 
+import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.html.dom.HTMLDocumentImpl;
 import org.apache.nutch.metadata.Nutch;
@@ -40,6 +41,7 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
@@ -58,9 +60,10 @@ public class TikaParser implements org.a
        private DOMContentUtils utils;
        private HtmlParseFilters htmlParseFilters;
        private String cachingPolicy;
+       private HtmlMapper HTMLMapper;
 
        @SuppressWarnings("deprecation")
-  public ParseResult getParse(Content content) {
+       public ParseResult getParse(Content content) {
                String mimeType = content.getContentType();
 
                URL base;
@@ -93,11 +96,14 @@ public class TikaParser implements org.a
                DocumentFragment root = doc.createDocumentFragment();
                DOMBuilder domhandler = new DOMBuilder(doc, root);
                ParseContext context = new ParseContext();
+               if (HTMLMapper != null)
+                       context.set(HtmlMapper.class, HTMLMapper);
                tikamd.set(Metadata.CONTENT_TYPE, mimeType);
                try {
-                 parser.parse(new ByteArrayInputStream(raw), domhandler, 
tikamd,context);
+                       parser.parse(new ByteArrayInputStream(raw), domhandler, 
tikamd,
+                                       context);
                } catch (Exception e) {
-                       LOG.error("Error parsing "+content.getUrl(),e);
+                       LOG.error("Error parsing " + content.getUrl(), e);
                        return new ParseStatus(ParseStatus.FAILED, 
e.getMessage())
                                        .getEmptyParseResult(content.getUrl(), 
getConf());
                }
@@ -168,18 +174,18 @@ public class TikaParser implements org.a
                        status.setArgs(new String[] { 
metaTags.getRefreshHref().toString(),
                                        
Integer.toString(metaTags.getRefreshTime()) });
                }
-               ParseData parseData = new ParseData(status, title, outlinks, 
content
-                               .getMetadata(), nutchMetadata);
-               ParseResult parseResult = ParseResult.createParseResult(content
-                               .getUrl(), new ParseImpl(text, parseData));
+               ParseData parseData = new ParseData(status, title, outlinks,
+                               content.getMetadata(), nutchMetadata);
+               ParseResult parseResult = ParseResult.createParseResult(
+                               content.getUrl(), new ParseImpl(text, 
parseData));
 
                // run filters on parse
                ParseResult filteredParse = 
this.htmlParseFilters.filter(content,
                                parseResult, metaTags, root);
                if (metaTags.getNoCache()) { // not okay to cache
                        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry 
: filteredParse)
-                               entry.getValue().getData().getParseMeta().set(
-                                               Nutch.CACHING_FORBIDDEN_KEY, 
cachingPolicy);
+                               entry.getValue().getData().getParseMeta()
+                                               
.set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
                }
                return filteredParse;
        }
@@ -189,7 +195,7 @@ public class TikaParser implements org.a
                this.tikaConfig = null;
 
                // do we want a custom Tika configuration file
-               // deprecated since Tika 0.7 which is based on 
+               // deprecated since Tika 0.7 which is based on
                // a service provider based configuration
                String customConfFile = conf.get("tika.config.file");
                if (customConfFile != null) {
@@ -212,6 +218,26 @@ public class TikaParser implements org.a
                        }
                }
 
+               // use a custom htmlmapper
+               String htmlmapperClassName = 
conf.get("tika.htmlmapper.classname");
+               if (StringUtils.isNotBlank(htmlmapperClassName)) {
+                       try {
+                               Class HTMLMapperClass = 
Class.forName(htmlmapperClassName);
+                               boolean interfaceOK = HtmlMapper.class
+                                               
.isAssignableFrom(HTMLMapperClass);
+                               if (!interfaceOK) {
+                                       throw new RuntimeException("Class " + 
htmlmapperClassName
+                                                       + " does not implement 
HtmlMapper");
+                               }
+                               HTMLMapper = (HtmlMapper) 
HTMLMapperClass.newInstance();
+                       } catch (Exception e) {
+                               LOG.error("Can't generate instance for class "
+                                               + htmlmapperClassName);
+                               throw new RuntimeException("Can't generate 
instance for class "
+                                               + htmlmapperClassName);
+                       }
+               }
+
                this.htmlParseFilters = new HtmlParseFilters(getConf());
                this.utils = new DOMContentUtils(conf);
                this.cachingPolicy = 
getConf().get("parser.caching.forbidden.policy",


Reply via email to