Author: jukka
Date: Tue Mar 24 11:49:38 2009
New Revision: 757747
URL: http://svn.apache.org/viewvc?rev=757747&view=rev
Log:
TIKA-210: html content directly under body node not parsed correctly
Make sure that the XHTML document has started when the first characters are
seen.
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
lucene/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=757747&r1=757746&r2=757747&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Tue Mar 24 11:49:38 2009
@@ -186,6 +186,16 @@
}
}
+ /**
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
+ */
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ lazyStartDocument();
+ super.characters(ch, start, length);
+ }
+
//------------------------------------------< public convenience methods >
public void startElement(String name) throws SAXException {
Modified:
lucene/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=757747&r1=757746&r2=757747&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Tue Mar 24 11:49:38 2009
@@ -134,4 +134,18 @@
assertEquals("", content);
}
+ /**
+ * Test case for TIKA-210
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
+ */
+ public void testCharactersDirectlyUnderBodyElement() throws Exception {
+ String test = "<html><body>test</body></html>";
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ handler, new Metadata());
+ String content = handler.toString();
+ assertEquals("test", content);
+ }
+
}