Repository: tika
Updated Branches:
  refs/heads/2.x e27526b84 -> f4bacf859


TIKA-2025 increase number of significant digits extracted in "general" format 
in xls/xlsx


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f4bacf85
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f4bacf85
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f4bacf85

Branch: refs/heads/2.x
Commit: f4bacf859650abbe438d7e19d6c0abdcd72a5b34
Parents: e27526b
Author: tballison <talli...@mitre.org>
Authored: Fri Jul 22 08:52:36 2016 -0400
Committer: tballison <talli...@mitre.org>
Committed: Fri Jul 22 08:52:36 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +
 .../tika/parser/microsoft/ExcelExtractor.java   |  34 ++++++-
 .../microsoft/TikaExcelDataFormatter.java       |  41 +++++++++
 .../microsoft/TikaExcelGeneralFormat.java       |  90 +++++++++++++++++++
 .../ooxml/XSSFExcelExtractorDecorator.java      |  11 ++-
 .../tika/parser/microsoft/ExcelParserTest.java  |  10 +++
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  10 +++
 .../test-documents/testEXCEL_big_numbers.xls    | Bin 0 -> 26112 bytes
 .../test-documents/testEXCEL_big_numbers.xlsx   | Bin 0 -> 8396 bytes
 9 files changed, 195 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 64e1f53..1911376 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
+  * Maintain more significant digits in cells of "General" format
+    in XLS and XLSX (TIKA-2025).
+
   * Improve extraction of embedded documents for PPT, PPTX and XLSX
     (TIKA-2026).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 87f395c..53d95c2 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -219,6 +219,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
          * formatting within the extraction.
          */
         private FormatTrackingHSSFListener formatListener;
+        private final TikaExcelDataFormatter tikaExcelDataFormatter;
+
         /**
          * List of worksheet names.
          */
@@ -255,7 +257,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             this.handler = handler;
             this.extractor = extractor;
             this.format = NumberFormat.getInstance(locale);
-            this.formatListener = new FormatTrackingHSSFListener(this, locale);
+            this.formatListener = new TikaFormatTrackingHSSFListener(this, 
locale);new FormatTrackingHSSFListener(this, locale);
+            this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale);
         }
 
         /**
@@ -614,6 +617,35 @@ public class ExcelExtractor extends AbstractPOIFSExtractor 
{
                 findPictures(escherRecord.getChildRecords());
             }
         }
+        /**
+         * Format-tracking listener that renders numeric cells whose format
+         * string is "General" (in any letter case), or for which no format
+         * string is available, with {@link TikaExcelGeneralFormat}. Cells
+         * with any other format string are handed to the superclass.
+         * See TIKA-2025.
+         */
+        private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener {
+            //TIKA-2025 -- use this to preserve large numbers in "General" format
+            //against the MS spec.
+            private final TikaExcelGeneralFormat generalFormat;
+
+            /**
+             * @param childListener listener passed through to the superclass
+             * @param locale locale passed to the superclass and used to build
+             *               the general format
+             */
+            public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) {
+                super(childListener, locale);
+                generalFormat = new TikaExcelGeneralFormat(locale);
+            }
+
+            /**
+             * Formats the numeric value of a cell record.
+             *
+             * @param cell a {@link NumberRecord} or {@link FormulaRecord}
+             * @return the formatted cell value
+             * @throws IllegalArgumentException if a cell that takes the
+             *         general-format path is neither a NumberRecord nor a
+             *         FormulaRecord
+             */
+            @Override
+            public String formatNumberDateCell(CellValueRecordInterface cell) {
+                String formatString = this.getFormatString(cell);
+                // TikaExcelDataFormatter registers both "General" and "general",
+                // so compare case-insensitively to keep XLS and XLSX consistent.
+                if (formatString != null && !"General".equalsIgnoreCase(formatString)) {
+                    return super.formatNumberDateCell(cell);
+                }
+
+                final double value;
+                if (cell instanceof NumberRecord) {
+                    value = ((NumberRecord) cell).getValue();
+                } else if (cell instanceof FormulaRecord) {
+                    value = ((FormulaRecord) cell).getValue();
+                } else {
+                    throw new IllegalArgumentException("Unsupported CellValue Record passed in " + cell);
+                }
+                return generalFormat.format(value);
+            }
+        }
     }
 
     /**

http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
new file mode 100644
index 0000000..7144d73
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.util.Locale;
+
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.util.LocaleUtil;
+
+/**
+ * A {@link DataFormatter} that substitutes {@link TikaExcelGeneralFormat}
+ * for Excel's General format, so that more significant digits are kept
+ * than the MS Spec allows.
+ * See TIKA-2025.
+ */
+public class TikaExcelDataFormatter extends DataFormatter {
+
+    /**
+     * Creates a formatter for the locale returned by
+     * {@link LocaleUtil#getUserLocale()}.
+     */
+    public TikaExcelDataFormatter() {
+        this(LocaleUtil.getUserLocale());
+    }
+
+    /**
+     * Creates a formatter for the given locale and registers a separate
+     * {@link TikaExcelGeneralFormat} under each spelling of the general
+     * format name.
+     *
+     * @param locale locale passed to the superclass and to each general format
+     */
+    public TikaExcelDataFormatter(Locale locale) {
+        super(locale);
+        final String[] generalNames = {"General", "general"};
+        for (String generalName : generalNames) {
+            addFormat(generalName, new TikaExcelGeneralFormat(locale));
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java
 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java
new file mode 100644
index 0000000..6ddc0e0
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+
+import java.math.BigDecimal;
+import java.math.MathContext;
+import java.math.RoundingMode;
+import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
+import java.text.FieldPosition;
+import java.text.Format;
+import java.text.ParsePosition;
+import java.util.Locale;
+
+import org.apache.poi.ss.usermodel.DataFormatter;
+
+/**
+ * A Format for Excel's "General" number format that allows up to 15
+ * significant digits for integers.
+ * This goes against the Microsoft spec, but it preserves information
+ * for long strings of digits.
+ * <p>
+ * Values whose magnitude is above 1E15, or non-zero and at most 1E-15,
+ * are written in scientific notation; other integral values are written
+ * as plain integers; all remaining values are rounded to 15 significant
+ * digits and written with at most 10 decimal places.
+ * <p>
+ * This was derived from POI's ExcelGeneralNumberFormat
+ */
+public class TikaExcelGeneralFormat extends Format {
+
+    private static final long serialVersionUID = 1L;
+
+    /** Rounds to 15 significant figures, half-up. */
+    private static final MathContext TO_15_SF = new MathContext(15, RoundingMode.HALF_UP);
+
+    private final DecimalFormatSymbols decimalSymbols;
+    private final DecimalFormat integerFormat;
+    private final DecimalFormat decimalFormat;
+    private final DecimalFormat scientificFormat;
+
+    /**
+     * Builds the integer, decimal and scientific formatters from the decimal
+     * symbols of the given locale; each one is passed through
+     * {@link DataFormatter#setExcelStyleRoundingMode}.
+     *
+     * @param locale locale whose decimal format symbols are used
+     */
+    public TikaExcelGeneralFormat(final Locale locale) {
+        decimalSymbols = DecimalFormatSymbols.getInstance(locale);
+        scientificFormat = new DecimalFormat("0.##############E0", decimalSymbols);
+        DataFormatter.setExcelStyleRoundingMode(scientificFormat);
+        integerFormat = new DecimalFormat("#", decimalSymbols);
+        DataFormatter.setExcelStyleRoundingMode(integerFormat);
+        decimalFormat = new DecimalFormat("#.##########", decimalSymbols);
+        DataFormatter.setExcelStyleRoundingMode(decimalFormat);
+    }
+
+    /**
+     * Formats a value according to the rules described on the class.
+     *
+     * @param number the value to format; non-{@link Number} objects, NaN and
+     *               infinities are delegated unchanged to the integer formatter
+     * @param toAppendTo buffer the text is appended to
+     * @param pos field position passed through to the underlying formatter
+     * @return the buffer returned by the underlying formatter
+     */
+    @Override
+    public StringBuffer format(Object number, StringBuffer toAppendTo, FieldPosition pos) {
+        if (!(number instanceof Number)) {
+            // testBug54786 gets here with a date, so retain previous behaviour
+            return integerFormat.format(number, toAppendTo, pos);
+        }
+        final double value = ((Number) number).doubleValue();
+        if (Double.isInfinite(value) || Double.isNaN(value)) {
+            return integerFormat.format(number, toAppendTo, pos);
+        }
+
+        final double abs = Math.abs(value);
+        if (abs > 1E15 || (abs <= 1E-15 && abs > 0)) {
+            return scientificFormat.format(number, toAppendTo, pos);
+        }
+        // Magnitudes above 1E15 have already returned, so only the
+        // integral-value check is needed here.
+        if (Math.floor(value) == value) {
+            return integerFormat.format(number, toAppendTo, pos);
+        }
+        // Non-integers of non-scientific magnitude: round to 15 significant
+        // digits first, then print with at most 10 decimal places. This is
+        // deliberately wider than the 11 numeric characters described in
+        // https://support.microsoft.com/en-us/kb/65903
+        final double rounded = new BigDecimal(value).round(TO_15_SF).doubleValue();
+        return decimalFormat.format(rounded, toAppendTo, pos);
+    }
+
+    /**
+     * Parsing is not supported.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public Object parseObject(String source, ParsePosition pos) {
+        throw new UnsupportedOperationException();
+    }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 79ec3c4..d375dd9 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -18,7 +18,11 @@ package org.apache.tika.parser.microsoft.ooxml;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
 
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -47,6 +51,7 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
@@ -83,9 +88,9 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
         extractor.setLocale(locale);
 
         if (locale == null) {
-            formatter = new DataFormatter();
+            formatter = new TikaExcelDataFormatter();
         } else {
-            formatter = new DataFormatter(locale);
+            formatter = new TikaExcelDataFormatter(locale);
         }
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 109ad9c..6f411f5 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -409,4 +409,14 @@ public class ExcelParserTest extends TikaTest {
         List<Metadata> metadataList = 
getRecursiveMetadata("testEXCEL_embeddedPDF.xls");
         assertEquals("application/pdf", 
metadataList.get(2).get(Metadata.CONTENT_TYPE));
     }
+
+    @Test
+    public void testBigIntegersWGeneralFormat() throws Exception {
+        //TIKA-2025
+        //16 digit values are treated as scientific notation
+        final String sixteenDigitText = "1.23456789012345E15";
+        final String xml = getXML("testEXCEL_big_numbers.xls").xml;
+
+        assertContains("123456789012345", xml);//15 digit number
+        assertContains("123456789012346", xml);//15 digit formula
+        assertContains(sixteenDigitText, xml);//16 digit number
+        //16 digit formula, ditto; same expected text, so this repeats the check above
+        assertContains(sixteenDigitText, xml);
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 0a29ab2..8c1ee39 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1236,6 +1236,16 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder 
(2)\\embed1.zip",
                 
Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
     }
+
+    @Test
+    public void testBigIntegersWGeneralFormat() throws Exception {
+        //TIKA-2025
+        //16 digit values are treated as scientific notation
+        final String sixteenDigitText = "1.23456789012345E+15";
+        final String xml = getXML("testEXCEL_big_numbers.xlsx").xml;
+
+        assertContains("123456789012345", xml);//15 digit number
+        assertContains("123456789012346", xml);//15 digit formula
+        assertContains(sixteenDigitText, xml);//16 digit number
+        //16 digit formula, ditto; same expected text, so this repeats the check above
+        assertContains(sixteenDigitText, xml);
+    }
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xls
----------------------------------------------------------------------
diff --git 
a/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xls
 
b/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xls
new file mode 100644
index 0000000..c5c10f0
Binary files /dev/null and 
b/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xls
 differ

http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx
----------------------------------------------------------------------
diff --git 
a/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx
 
b/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx
new file mode 100644
index 0000000..ce5dd8e
Binary files /dev/null and 
b/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx
 differ

Reply via email to