Author: jukka
Date: Tue Mar 24 11:49:38 2009
New Revision: 757747

URL: http://svn.apache.org/viewvc?rev=757747&view=rev
Log:
TIKA-210: html content directly under body node not parsed correctly

Make sure that the XHTML document has started when the first characters are 
seen.

Modified:
    lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
    
lucene/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: 
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=757747&r1=757746&r2=757747&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Tue Mar 24 11:49:38 2009
@@ -186,6 +186,16 @@
         }
     }
 
+    /**
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
+     */
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        lazyStartDocument();
+        super.characters(ch, start, length);
+    }
+
     //------------------------------------------< public convenience methods >
 
     public void startElement(String name) throws SAXException {

Modified: 
lucene/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=757747&r1=757746&r2=757747&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Tue Mar 24 11:49:38 2009
@@ -134,4 +134,18 @@
         assertEquals("", content);
     }
 
+    /**
+     * Test case for TIKA-210
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
+     */
+    public void testCharactersDirectlyUnderBodyElement() throws Exception {
+        String test = "<html><body>test</body></html>";
+        ContentHandler handler = new BodyContentHandler();
+        parser.parse(
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                handler, new Metadata());
+        String content = handler.toString();
+        assertEquals("test", content);
+    }
+
 }


Reply via email to