Author: jukka
Date: Sat Aug 22 20:41:34 2009
New Revision: 806887

URL: http://svn.apache.org/viewvc?rev=806887&view=rev
Log:
TIKA-268: HTMLParser omits necessary space-characters when parsing table-data

Updated the SAFE_ELEMENTS map in HtmlParser as suggested by Uwe Schindler. 
Added a test case.

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=806887&r1=806886&r2=806887&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sat Aug 22 20:41:34 2009
@@ -76,7 +76,12 @@
         SAFE_ELEMENTS.put("DD", "dd");
         SAFE_ELEMENTS.put("PRE", "pre");
         SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
-        SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues
+        SAFE_ELEMENTS.put("TABLE", "table");
+        SAFE_ELEMENTS.put("THEAD", "thead");
+        SAFE_ELEMENTS.put("TBODY", "tbody");
+        SAFE_ELEMENTS.put("TR", "tr");
+        SAFE_ELEMENTS.put("TH", "th");
+        SAFE_ELEMENTS.put("TD", "td");
 
         DISCARD_ELEMENTS.add("STYLE");
         DISCARD_ELEMENTS.add("SCRIPT");

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=806887&r1=806886&r2=806887&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sat Aug 22 20:41:34 2009
@@ -148,4 +148,21 @@
         assertEquals("test", content);
     }
 
+    /**
+     * Test case for TIKA-268
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
+     */
+    public void testWhitespaceBetweenTableCells() throws Exception {
+        String test =
+            "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
+        ContentHandler handler = new BodyContentHandler();
+        parser.parse(
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                handler, new Metadata());
+        String content = handler.toString();
+        assertTrue(content.contains("a"));
+        assertTrue(content.contains("b"));
+        assertFalse(content.contains("ab"));
+    }
+
 }


Reply via email to