This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 348b87e7f41b79ff115e17d9c91d2dad63a57c15 Author: Nick Burch <n...@gagravarr.org> AuthorDate: Fri May 18 15:15:32 2018 +0100 TIKA-2479 Update XLS missing cell/row handling to match XLSX and XLSB, add unit test for missing rows, and enable the Columnar tests for the Excel formats --- .../tika/parser/microsoft/ExcelExtractor.java | 26 ++++++------ .../org/apache/tika/parser/TabularFormatsTest.java | 47 ++++++++++------------ .../tika/parser/microsoft/ExcelParserTest.java | 25 +++++++++++- 3 files changed, 60 insertions(+), 38 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java index 0dc33ee..ff5971a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java @@ -16,7 +16,7 @@ */ package org.apache.tika.parser.microsoft; -import java.awt.*; +import java.awt.Point; import java.io.IOException; import java.text.NumberFormat; import java.util.ArrayList; @@ -42,7 +42,6 @@ import org.apache.poi.hssf.record.CountryRecord; import org.apache.poi.hssf.record.DateWindow1904Record; import org.apache.poi.hssf.record.DrawingGroupRecord; import org.apache.poi.hssf.record.EOFRecord; -import org.apache.poi.hssf.record.ExtSSTRecord; import org.apache.poi.hssf.record.ExtendedFormatRecord; import org.apache.poi.hssf.record.FooterRecord; import org.apache.poi.hssf.record.FormatRecord; @@ -281,7 +280,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { public void processFile(DirectoryNode root, boolean listenForAllRecords) throws IOException, SAXException, TikaException { - // Set up listener and register the records we want to process HSSFRequest hssfRequest = new HSSFRequest(); if (listenForAllRecords) { @@ -494,15 +492,14 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { HeaderRecord headerRecord = 
(HeaderRecord) record; addTextCell(record, headerRecord.getText()); } - break; + break; case FooterRecord.sid: if (extractor.officeParserConfig.getIncludeHeadersAndFooters()) { FooterRecord footerRecord = (FooterRecord) record; addTextCell(record, footerRecord.getText()); } - break; - + break; } previousSid = record.getSid(); @@ -599,12 +596,17 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { handler.startElement("tr"); handler.startElement("td"); for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) { - while (currentRow < entry.getKey().y) { - handler.endElement("td"); - handler.endElement("tr"); - handler.startElement("tr"); - handler.startElement("td"); - currentRow++; + if (currentRow != entry.getKey().y) { + // We've moved onto a new row, possibly skipping some + do { + handler.endElement("td"); + handler.endElement("tr"); + handler.startElement("tr"); + handler.startElement("td"); + currentRow++; + } while (officeParserConfig.getIncludeMissingRows() && + currentRow < entry.getKey().y); + currentRow = entry.getKey().y; currentColumn = 0; } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java index 41139e2..4a52118 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java @@ -64,8 +64,8 @@ public class TabularFormatsTest extends TikaTest { "87.5%","88.9%","90.0%" }, new Pattern[] { - Pattern.compile("01-(01|JAN|Jan)-(60|1960)"), - Pattern.compile("02-01-1960"), + Pattern.compile("0?1-01-1960"), + Pattern.compile("0?2-01-1960"), Pattern.compile("17-01-1960"), Pattern.compile("22-03-1960"), Pattern.compile("13-09-1960"), @@ -77,17 +77,17 @@ public class TabularFormatsTest extends TikaTest { Pattern.compile("19-05-1987"), }, new Pattern[] { - Pattern.compile("01(JAN|Jan)(60|1960):00:00:01(.00)?"), - 
Pattern.compile("01(JAN|Jan)(60|1960):00:00:10(.00)?"), - Pattern.compile("01(JAN|Jan)(60|1960):00:01:40(.00)?"), - Pattern.compile("01(JAN|Jan)(60|1960):00:16:40(.00)?"), - Pattern.compile("01(JAN|Jan)(60|1960):02:46:40(.00)?"), - Pattern.compile("02(JAN|Jan)(60|1960):03:46:40(.00)?"), - Pattern.compile("12(JAN|Jan)(60|1960):13:46:40(.00)?"), - Pattern.compile("25(APR|Apr)(60|1960):17:46:40(.00)?"), - Pattern.compile("03(MAR|Mar)(63|1963):09:46:40(.00)?"), - Pattern.compile("09(SEP|Sep)(91|1991):01:46:40(.00)?"), - Pattern.compile("19(NOV|Nov)(76|2276):17:46:40(.00)?") + Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:00:01(.00)?"), + Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:00:10(.00)?"), + Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:01:40(.00)?"), + Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:16:40(.00)?"), + Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]02:46:40(.00)?"), + Pattern.compile("02(JAN|Jan)(60|1960)[:\\s]03:46:40(.00)?"), + Pattern.compile("12(JAN|Jan)(60|1960)[:\\s]13:46:40(.00)?"), + Pattern.compile("25(APR|Apr)(60|1960)[:\\s]17:46:40(.00)?"), + Pattern.compile("03(MAR|Mar)(63|1963)[:\\s]09:46:40(.00)?"), + Pattern.compile("09(SEP|Sep)(91|1991)[:\\s]01:46:40(.00)?"), + Pattern.compile("19(NOV|Nov)(76|2276)[:\\s]17:46:40(.00)?") }, new Pattern[] { Pattern.compile("0?0:00:01(.\\d\\d)?"), @@ -226,25 +226,22 @@ public class TabularFormatsTest extends TikaTest { XMLResult result = getXML("test-columnar.xls"); String xml = result.xml; assertHeaders(xml, false, true, false); - // TODO Correctly handle empty cells then enable this test - //assertContents(xml, true, false); + assertContents(xml, true, false); } @Test public void testXLSX() throws Exception { XMLResult result = getXML("test-columnar.xlsx"); String xml = result.xml; assertHeaders(xml, false, true, false); - // TODO Fix formatting in export then enable this test - //assertContents(xml, true, false); + assertContents(xml, true, false); + } + @Test + public void testXLSB() throws Exception { 
+ XMLResult result = getXML("test-columnar.xlsb"); + String xml = result.xml; + assertHeaders(xml, false, true, false); + assertContents(xml, true, false); } - // Get a test XLSB file, then enable this unit test -// @Test -// public void testXLSB() throws Exception { -// XMLResult result = getXML("test-columnar.xlsb"); -// String xml = result.xml; -// assertHeaders(xml, false, true, false); -// assertContents(xml, true, false); -// } // TODO Fix the ODS test - currently failing with // org.xml.sax.SAXException: Namespace http://www.w3.org/1999/xhtml not declared diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java index 732c11c..6304402 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java @@ -20,7 +20,6 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -import java.io.File; import java.io.InputStream; import java.text.DecimalFormatSymbols; import java.util.List; @@ -78,6 +77,30 @@ public class ExcelParserTest extends TikaTest { assertNotContained("9.0", content); assertContains("196", content); assertNotContained("196.0", content); + + + // Won't include missing rows by default + assertContains("Numbers and their Squares\n\t\tNumber", content); + assertContains("\tSquare\n\t\t1", content); + } + + // Request with missing rows + try (InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL.xls")) { + OfficeParserConfig config = new OfficeParserConfig(); + config.setIncludeMissingRows(true); + + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + context.set(OfficeParserConfig.class, 
config); + new OfficeParser().parse(input, handler, metadata, context); + + // Will now have the missing rows, each with a single empty cell + String content = handler.toString(); + assertContains("Numbers and their Squares\n\t\n\t\n\t\tNumber", content); + assertContains("\tSquare\n\t\n\t\t1", content); } } -- To stop receiving notification emails like this one, please contact n...@apache.org.