Author: jukka
Date: Sun Dec 13 21:15:00 2009
New Revision: 890121

URL: http://svn.apache.org/viewvc?rev=890121&view=rev
Log:
TIKA-343: some parsers produce glued words

Add custom handling for <br> tags unless otherwise specified by the HtmlMapper.

Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=890121&r1=890120&r2=890121&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 Sun Dec 13 21:15:00 2009
@@ -133,6 +133,10 @@
                 xhtml.endElement(safe);
             } else if ("A".equals(name)) {
                 xhtml.endElement("a");
+            } else if ("BR".equals(name)) {
+                // TIKA-343: Map <br> tags to newlines, unless the HtmlMapper
+                // above has already determined to map them to something else
+                xhtml.characters("\n");
             }
         }
 

Modified: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=890121&r1=890120&r2=890121&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 Sun Dec 13 21:15:00 2009
@@ -273,4 +273,21 @@
         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
     }
 
+    /**
+     * Test case for HTML content like "foo&lt;br&gt;bar" that should result
+     * in two whitespace-separated tokens "foo" and "bar" instead of a single
+     * token "foobar".
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
+     */
+    public void testLineBreak() throws Exception {
+        String test = "<html><body><p>foo<br>bar</p></body></html>";
+        String text = new Tika().parseToString(
+                new ByteArrayInputStream(test.getBytes("US-ASCII")));
+        String[] parts = text.trim().split("\\s+");
+        assertEquals(2, parts.length);
+        assertEquals("foo", parts[0]);
+        assertEquals("bar", parts[1]);
+    }
+
 }


Reply via email to