This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 348b87e7f41b79ff115e17d9c91d2dad63a57c15
Author: Nick Burch <n...@gagravarr.org>
AuthorDate: Fri May 18 15:15:32 2018 +0100

    TIKA-2479 Update XLS missing cell/row handling to match XLSX and XLSB, add unit test for missing rows, and enable the Columnar tests for the Excel formats
---
 .../tika/parser/microsoft/ExcelExtractor.java      | 26 ++++++------
 .../org/apache/tika/parser/TabularFormatsTest.java | 47 ++++++++++------------
 .../tika/parser/microsoft/ExcelParserTest.java     | 25 +++++++++++-
 3 files changed, 60 insertions(+), 38 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 0dc33ee..ff5971a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -16,7 +16,7 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import java.awt.*;
+import java.awt.Point;
 import java.io.IOException;
 import java.text.NumberFormat;
 import java.util.ArrayList;
@@ -42,7 +42,6 @@ import org.apache.poi.hssf.record.CountryRecord;
 import org.apache.poi.hssf.record.DateWindow1904Record;
 import org.apache.poi.hssf.record.DrawingGroupRecord;
 import org.apache.poi.hssf.record.EOFRecord;
-import org.apache.poi.hssf.record.ExtSSTRecord;
 import org.apache.poi.hssf.record.ExtendedFormatRecord;
 import org.apache.poi.hssf.record.FooterRecord;
 import org.apache.poi.hssf.record.FormatRecord;
@@ -281,7 +280,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
 
         public void processFile(DirectoryNode root, boolean listenForAllRecords)
                 throws IOException, SAXException, TikaException {
-
             // Set up listener and register the records we want to process
             HSSFRequest hssfRequest = new HSSFRequest();
             if (listenForAllRecords) {
@@ -494,15 +492,14 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                         HeaderRecord headerRecord = (HeaderRecord) record;
                         addTextCell(record, headerRecord.getText());
                     }
-                       break;
+                    break;
                        
                 case FooterRecord.sid:
                     if (extractor.officeParserConfig.getIncludeHeadersAndFooters()) {
                         FooterRecord footerRecord = (FooterRecord) record;
                         addTextCell(record, footerRecord.getText());
                     }
-                       break;
-
+                    break;
             }
 
             previousSid = record.getSid();
@@ -599,12 +596,17 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             handler.startElement("tr");
             handler.startElement("td");
             for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
-                while (currentRow < entry.getKey().y) {
-                    handler.endElement("td");
-                    handler.endElement("tr");
-                    handler.startElement("tr");
-                    handler.startElement("td");
-                    currentRow++;
+                if (currentRow != entry.getKey().y) {
+                    // We've moved onto a new row, possibly skipping some
+                    do {
+                        handler.endElement("td");
+                        handler.endElement("tr");
+                        handler.startElement("tr");
+                        handler.startElement("td");
+                        currentRow++;
+                    } while (officeParserConfig.getIncludeMissingRows() &&
+                             currentRow < entry.getKey().y);
+                    currentRow = entry.getKey().y;
                     currentColumn = 0;
                 }
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
index 41139e2..4a52118 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
@@ -64,8 +64,8 @@ public class TabularFormatsTest extends TikaTest {
                 "87.5%","88.9%","90.0%"
         },
         new Pattern[] {
-                Pattern.compile("01-(01|JAN|Jan)-(60|1960)"),
-                Pattern.compile("02-01-1960"),
+                Pattern.compile("0?1-01-1960"),
+                Pattern.compile("0?2-01-1960"),
                 Pattern.compile("17-01-1960"),
                 Pattern.compile("22-03-1960"),
                 Pattern.compile("13-09-1960"),
@@ -77,17 +77,17 @@ public class TabularFormatsTest extends TikaTest {
                 Pattern.compile("19-05-1987"),
         },
         new Pattern[] {
-             Pattern.compile("01(JAN|Jan)(60|1960):00:00:01(.00)?"),
-             Pattern.compile("01(JAN|Jan)(60|1960):00:00:10(.00)?"),
-             Pattern.compile("01(JAN|Jan)(60|1960):00:01:40(.00)?"),
-             Pattern.compile("01(JAN|Jan)(60|1960):00:16:40(.00)?"),
-             Pattern.compile("01(JAN|Jan)(60|1960):02:46:40(.00)?"),
-             Pattern.compile("02(JAN|Jan)(60|1960):03:46:40(.00)?"),
-             Pattern.compile("12(JAN|Jan)(60|1960):13:46:40(.00)?"),
-             Pattern.compile("25(APR|Apr)(60|1960):17:46:40(.00)?"),
-             Pattern.compile("03(MAR|Mar)(63|1963):09:46:40(.00)?"),
-             Pattern.compile("09(SEP|Sep)(91|1991):01:46:40(.00)?"),
-             Pattern.compile("19(NOV|Nov)(76|2276):17:46:40(.00)?")
+             Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:00:01(.00)?"),
+             Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:00:10(.00)?"),
+             Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:01:40(.00)?"),
+             Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]00:16:40(.00)?"),
+             Pattern.compile("01(JAN|Jan)(60|1960)[:\\s]02:46:40(.00)?"),
+             Pattern.compile("02(JAN|Jan)(60|1960)[:\\s]03:46:40(.00)?"),
+             Pattern.compile("12(JAN|Jan)(60|1960)[:\\s]13:46:40(.00)?"),
+             Pattern.compile("25(APR|Apr)(60|1960)[:\\s]17:46:40(.00)?"),
+             Pattern.compile("03(MAR|Mar)(63|1963)[:\\s]09:46:40(.00)?"),
+             Pattern.compile("09(SEP|Sep)(91|1991)[:\\s]01:46:40(.00)?"),
+             Pattern.compile("19(NOV|Nov)(76|2276)[:\\s]17:46:40(.00)?")
         },
         new Pattern[] {
              Pattern.compile("0?0:00:01(.\\d\\d)?"),
@@ -226,25 +226,22 @@ public class TabularFormatsTest extends TikaTest {
         XMLResult result = getXML("test-columnar.xls");
         String xml = result.xml;
         assertHeaders(xml, false, true, false);
-        // TODO Correctly handle empty cells then enable this test
-        //assertContents(xml, true, false);
+        assertContents(xml, true, false);
     }
     @Test
     public void testXLSX() throws Exception {
         XMLResult result = getXML("test-columnar.xlsx");
         String xml = result.xml;
         assertHeaders(xml, false, true, false);
-        // TODO Fix formatting in export then enable this test
-        //assertContents(xml, true, false);
+        assertContents(xml, true, false);
+    }
+    @Test
+    public void testXLSB() throws Exception {
+        XMLResult result = getXML("test-columnar.xlsb");
+        String xml = result.xml;
+        assertHeaders(xml, false, true, false);
+        assertContents(xml, true, false);
     }
-    // Get a test XLSB file, then enable this unit test
-//    @Test
-//    public void testXLSB() throws Exception {
-//        XMLResult result = getXML("test-columnar.xlsb");
-//        String xml = result.xml;
-//        assertHeaders(xml, false, true, false);
-//        assertContents(xml, true, false);
-//    }
 
     // TODO Fix the ODS test - currently failing with
     // org.xml.sax.SAXException: Namespace http://www.w3.org/1999/xhtml not declared
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 732c11c..6304402 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -20,7 +20,6 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
-import java.io.File;
 import java.io.InputStream;
 import java.text.DecimalFormatSymbols;
 import java.util.List;
@@ -78,6 +77,30 @@ public class ExcelParserTest extends TikaTest {
             assertNotContained("9.0", content);
             assertContains("196", content);
             assertNotContained("196.0", content);
+
+
+            // Won't include missing rows by default
+            assertContains("Numbers and their Squares\n\t\tNumber", content);
+            assertContains("\tSquare\n\t\t1", content);
+        }
+
+        // Request with missing rows
+        try (InputStream input = ExcelParserTest.class.getResourceAsStream(
+                "/test-documents/testEXCEL.xls")) {
+            OfficeParserConfig config = new OfficeParserConfig();
+            config.setIncludeMissingRows(true);
+
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            context.set(OfficeParserConfig.class, config);
+            new OfficeParser().parse(input, handler, metadata, context);
+
+            // Will now have the missing rows, each with a single empty cell
+            String content = handler.toString();
+            assertContains("Numbers and their Squares\n\t\n\t\n\t\tNumber", content);
+            assertContains("\tSquare\n\t\n\t\t1", content);
         }
     }
 

-- 
To stop receiving notification emails like this one, please contact
n...@apache.org.

Reply via email to