Author: tallison
Date: Mon Aug 4 16:51:40 2014
New Revision: 1615675

URL: http://svn.apache.org/r1615675
Log:
TIKA-1317 extract contents from SDTs within cells in tables in XWPF (docx) files

Modified:
    tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1615675&r1=1615674&r2=1615675&view=diff
==============================================================================
--- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Mon Aug 4 16:51:40 2014
@@ -29,8 +29,10 @@ import org.apache.poi.xwpf.model.XWPFHea
 import org.apache.poi.xwpf.usermodel.BodyType;
 import org.apache.poi.xwpf.usermodel.IBody;
 import org.apache.poi.xwpf.usermodel.IBodyElement;
+import org.apache.poi.xwpf.usermodel.ICell;
 import org.apache.poi.xwpf.usermodel.IRunElement;
 import org.apache.poi.xwpf.usermodel.ISDTContent;
+import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
 import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
@@ -332,10 +334,14 @@ public class XWPFWordExtractorDecorator
         xhtml.startElement("tbody");
         for(XWPFTableRow row : table.getRows()) {
             xhtml.startElement("tr");
-            for(XWPFTableCell cell : row.getTableCells()) {
-                xhtml.startElement("td");
-                extractIBodyText(cell, xhtml);
-                xhtml.endElement("td");
+            for(ICell cell : row.getTableICells()){
+                xhtml.startElement("td");
+                if (cell instanceof XWPFTableCell) {
+                    extractIBodyText((XWPFTableCell)cell, xhtml);
+                } else if (cell instanceof XWPFSDTCell) {
+                    xhtml.characters(((XWPFSDTCell)cell).getContent().getText());
+                }
+                xhtml.endElement("td");
             }
             xhtml.endElement("tr");
         }

Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1615675&r1=1615674&r2=1615675&view=diff
==============================================================================
--- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Mon Aug 4 16:51:40 2014
@@ -987,6 +987,7 @@ public class OOXMLParserTest extends Tik
     /**
      * Test for missing text described in
      * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>.
+     * and TIKA-1317
      */
     @Test
     public void testMissingText() throws Exception {
@@ -1002,6 +1003,7 @@ public class OOXMLParserTest extends Tik
                     metadata.get(Metadata.CONTENT_TYPE));
             assertTrue(handler.toString().contains("BigCompany"));
             assertTrue(handler.toString().contains("Seasoned"));
+            assertTrue(handler.toString().contains("Rich_text_in_cell"));
         } finally {
             input.close();
         }