Author: jukka
Date: Sun Dec 13 22:01:51 2009
New Revision: 890126
URL: http://svn.apache.org/viewvc?rev=890126&view=rev
Log:
TIKA-343: some parsers produce glued words
Use the newline rules from XHTMLContentHandler in HtmlHandler in cases where
the HtmlMapper does not map an incoming HTML element to something different.
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=890126&r1=890125&r2=890126&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
Sun Dec 13 22:01:51 2009
@@ -16,6 +16,8 @@
*/
package org.apache.tika.sax;
+import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
@@ -49,44 +51,21 @@
/**
* The elements that get prepended with the {@link #TAB} character.
*/
- private static final Set<String> INDENT = new HashSet<String>() {{
- add("li");
- add("dd");
- add("dt");
- add("td");
- add("th");
- }};
+ private static final Set<String> INDENT =
+ unmodifiableSet("li", "dd", "dt", "td", "th");
/**
* The elements that get appended with the {@link #NL} character.
*/
- private static final Set<String> ENDLINE = new HashSet<String>() {{
- add("p");
- add("h1");
- add("h2");
- add("h3");
- add("h4");
- add("h5");
- add("h6");
- add("div");
- add("ul");
- add("ol");
- add("dl");
- add("pre");
- add("hr");
- add("blockquote");
- add("address");
- add("fieldset");
- add("table");
- add("form");
- add("noscript");
- add("li");
- add("dt");
- add("dd");
- add("noframes");
- add("br");
- add("tr");
- }};
+ public static final Set<String> ENDLINE = unmodifiableSet(
+ "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
+ "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
+ "noscript", "li", "dt", "dd", "noframes", "br", "tr");
+
+ private static Set<String> unmodifiableSet(String... elements) {
+ return Collections.unmodifiableSet(
+ new HashSet<String>(Arrays.asList(elements)));
+ }
/**
* Metadata associated with the document. Used to fill in the
@@ -182,7 +161,7 @@
throws SAXException {
super.endElement(uri, local, name);
if (XHTML.equals(uri) && ENDLINE.contains(local)) {
- ignorableWhitespace(NL, 0, NL.length);
+ newline();
}
}
@@ -217,6 +196,10 @@
characters(characters.toCharArray(), 0, characters.length());
}
+ public void newline() throws SAXException {
+ ignorableWhitespace(NL, 0, NL.length);
+ }
+
public void element(String name, String value) throws SAXException {
startElement(name);
characters(value);
Modified:
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java?rev=890126&r1=890125&r2=890126&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
(original)
+++
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
Sun Dec 13 22:01:51 2009
@@ -64,6 +64,7 @@
xhtml.endElement("table");
xhtml.endDocument();
+ System.out.println(output);
String[] words = output.toString().split("\\s+");
assertEquals(6, words.length);
assertEquals("foo", words[0]);