Author: jukka
Date: Sun Dec 13 22:01:51 2009
New Revision: 890126

URL: http://svn.apache.org/viewvc?rev=890126&view=rev
Log:
TIKA-343: Some parsers produce glued words

Use the newline rules from XHTMLContentHandler in HtmlHandler in cases where 
the HtmlMapper does not map the incoming HTML element to something different.

Modified:
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
    
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=890126&r1=890125&r2=890126&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
 (original)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
 Sun Dec 13 22:01:51 2009
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.sax;
 
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -49,44 +51,21 @@
     /**
      * The elements that get prepended with the {@link #TAB} character.
      */
-    private static final Set<String> INDENT = new HashSet<String>() {{
-        add("li");
-        add("dd");
-        add("dt");
-        add("td");
-        add("th");
-    }};
+    private static final Set<String> INDENT =
+        unmodifiableSet("li", "dd", "dt", "td", "th");
 
     /**
      * The elements that get appended with the {@link #NL} character.
      */
-    private static final Set<String> ENDLINE = new HashSet<String>() {{
-        add("p");
-        add("h1");
-        add("h2");
-        add("h3");
-        add("h4");
-        add("h5");
-        add("h6");
-        add("div");
-        add("ul");
-        add("ol");
-        add("dl");
-        add("pre");
-        add("hr");
-        add("blockquote");
-        add("address");
-        add("fieldset");
-        add("table");
-        add("form");
-        add("noscript");
-        add("li");
-        add("dt");
-        add("dd");
-        add("noframes");
-        add("br");
-        add("tr");
-    }};
+    public static final Set<String> ENDLINE = unmodifiableSet(
+            "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
+            "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
+            "noscript", "li", "dt", "dd", "noframes", "br", "tr");
+
+    private static Set<String> unmodifiableSet(String... elements) {
+        return Collections.unmodifiableSet(
+                new HashSet<String>(Arrays.asList(elements)));
+    }
 
     /**
      * Metadata associated with the document. Used to fill in the
@@ -182,7 +161,7 @@
             throws SAXException {
         super.endElement(uri, local, name);
         if (XHTML.equals(uri) && ENDLINE.contains(local)) {
-            ignorableWhitespace(NL, 0, NL.length);
+            newline();
         }
     }
 
@@ -217,6 +196,10 @@
         characters(characters.toCharArray(), 0, characters.length());
     }
 
+    public void newline() throws SAXException {
+        ignorableWhitespace(NL, 0, NL.length);
+    }
+
     public void element(String name, String value) throws SAXException {
         startElement(name);
         characters(value);

Modified: 
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java?rev=890126&r1=890125&r2=890126&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
 (original)
+++ 
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
 Sun Dec 13 22:01:51 2009
@@ -64,6 +64,7 @@
         xhtml.endElement("table");
         xhtml.endDocument();
 
+        System.out.println(output);
         String[] words = output.toString().split("\\s+");
         assertEquals(6, words.length);
         assertEquals("foo", words[0]);


Reply via email to