Author: jukka
Date: Sat Aug 22 20:41:34 2009
New Revision: 806887
URL: http://svn.apache.org/viewvc?rev=806887&view=rev
Log:
TIKA-268: HTMLParser omits necessary space-characters when parsing table-data
Updated the SAFE_ELEMENTS map in HtmlParser as suggested by Uwe Schindler.
Added a test case.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=806887&r1=806886&r2=806887&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sat Aug 22 20:41:34 2009
@@ -76,7 +76,12 @@
SAFE_ELEMENTS.put("DD", "dd");
SAFE_ELEMENTS.put("PRE", "pre");
SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
- SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues
+ SAFE_ELEMENTS.put("TABLE", "table");
+ SAFE_ELEMENTS.put("THEAD", "thead");
+ SAFE_ELEMENTS.put("TBODY", "tbody");
+ SAFE_ELEMENTS.put("TR", "tr");
+ SAFE_ELEMENTS.put("TH", "th");
+ SAFE_ELEMENTS.put("TD", "td");
DISCARD_ELEMENTS.add("STYLE");
DISCARD_ELEMENTS.add("SCRIPT");
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=806887&r1=806886&r2=806887&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sat Aug 22 20:41:34 2009
@@ -148,4 +148,21 @@
assertEquals("test", content);
}
+ /**
+ * Test case for TIKA-268
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
+ */
+ public void testWhitespaceBetweenTableCells() throws Exception {
+ String test =
+ "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ handler, new Metadata());
+ String content = handler.toString();
+ assertTrue(content.contains("a"));
+ assertTrue(content.contains("b"));
+ assertFalse(content.contains("ab"));
+ }
+
}