Author: jnioche
Date: Thu Dec 11 11:40:40 2014
New Revision: 1644604

URL: http://svn.apache.org/r1644604
Log:
NUTCH-1592 TikaParser can uppercase the element names while generating the DOM

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Dec 11 11:40:40 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1592 TikaParser can uppercase the element names while generating the DOM (jnioche)
+
 * NUTCH-1877 Suffix URL filter to ignore query string by default (markus via snagel)
 
 * NUTCH-1890 Major Typo in Documentation for Integrating Nutch and Solr (Boadu Akoto Charles Jnr, mattmann)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Dec 11 11:40:40 2014
@@ -1200,6 +1200,15 @@
 </property>
 -->
 
+<property>
+  <name>tika.uppercase.element.names</name>
+  <value>true</value>
+  <description>Determines whether TikaParser should uppercase the element name while generating the DOM
+  for a page, as done by Neko (used per default by parse-html)(see NUTCH-1592).
+  </description>
+</property>
+
+
 <!-- urlfilter plugin properties -->
 
 <property>

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Thu Dec 11 11:40:40 2014
@@ -47,6 +47,7 @@ import org.xml.sax.ext.LexicalHandler;
 class DOMBuilder
         implements ContentHandler, LexicalHandler
 {
+    private boolean upperCaseElementNames = true;
 
   /** Root document          */
   public Document m_doc;
@@ -265,7 +266,10 @@ class DOMBuilder
   {
 
     Element elem;
-
+    
+    if (upperCaseElementNames)
+        name = name.toUpperCase();
+    
        // Note that the namespace-aware call must be used to correctly
        // construct a Level 2 DOM, even for non-namespaced nodes.
     if ((null == ns) || (ns.length() == 0))
@@ -737,4 +741,12 @@ class DOMBuilder
    *        parameter entity, the name will begin with '%'.
    */
   public void skippedEntity(String name) throws org.xml.sax.SAXException{}
+  
+  public boolean isUpperCaseElementNames() {
+      return upperCaseElementNames;
+  }
+
+  public void setUpperCaseElementNames(boolean upperCaseElementNames) {
+      this.upperCaseElementNames = upperCaseElementNames;
+  }
 }

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1644604&r1=1644603&r2=1644604&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Thu Dec 11 11:40:40 2014
@@ -61,6 +61,7 @@ public class TikaParser implements org.a
        private HtmlParseFilters htmlParseFilters;
        private String cachingPolicy;
        private HtmlMapper HTMLMapper;
+       private boolean upperCaseElementNames = true;
 
        @SuppressWarnings("deprecation")
        public ParseResult getParse(Content content) {
@@ -95,6 +96,7 @@ public class TikaParser implements org.a
                doc.setErrorChecking(false);
                DocumentFragment root = doc.createDocumentFragment();
                DOMBuilder domhandler = new DOMBuilder(doc, root);
+               domhandler.setUpperCaseElementNames(upperCaseElementNames);
                ParseContext context = new ParseContext();
                if (HTMLMapper != null)
                        context.set(HtmlMapper.class, HTMLMapper);
@@ -242,7 +244,8 @@ public class TikaParser implements org.a
                this.utils = new DOMContentUtils(conf);
                this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
                                Nutch.CACHING_FORBIDDEN_CONTENT);
-
+               this.upperCaseElementNames = getConf().getBoolean(
+                               "tika.uppercase.element.names", true);
        }
 
        public Configuration getConf() {


Reply via email to