Repository: tika Updated Branches: refs/heads/2.x e27526b84 -> f4bacf859
TIKA-2025 increase number of significant digits extracted in "general" format in xls/xlsx Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f4bacf85 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f4bacf85 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f4bacf85 Branch: refs/heads/2.x Commit: f4bacf859650abbe438d7e19d6c0abdcd72a5b34 Parents: e27526b Author: tballison <talli...@mitre.org> Authored: Fri Jul 22 08:52:36 2016 -0400 Committer: tballison <talli...@mitre.org> Committed: Fri Jul 22 08:52:36 2016 -0400 ---------------------------------------------------------------------- CHANGES.txt | 3 + .../tika/parser/microsoft/ExcelExtractor.java | 34 ++++++- .../microsoft/TikaExcelDataFormatter.java | 41 +++++++++ .../microsoft/TikaExcelGeneralFormat.java | 90 +++++++++++++++++++ .../ooxml/XSSFExcelExtractorDecorator.java | 11 ++- .../tika/parser/microsoft/ExcelParserTest.java | 10 +++ .../parser/microsoft/ooxml/OOXMLParserTest.java | 10 +++ .../test-documents/testEXCEL_big_numbers.xls | Bin 0 -> 26112 bytes .../test-documents/testEXCEL_big_numbers.xlsx | Bin 0 -> 8396 bytes 9 files changed, 195 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 64e1f53..1911376 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -17,6 +17,9 @@ Release 2.0 - ??? Release 1.14 - ??? + * Maintain more significant digits in cells of "General" format + in XLS and XLSX (TIKA-2025). + * Improve extraction of embedded documents for PPT, PPTX and XLSX (TIKA-2026). http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java index 87f395c..53d95c2 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java @@ -219,6 +219,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { * formatting within the extraction. */ private FormatTrackingHSSFListener formatListener; + private final TikaExcelDataFormatter tikaExcelDataFormatter; + /** * List of worksheet names. */ @@ -255,7 +257,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { this.handler = handler; this.extractor = extractor; this.format = NumberFormat.getInstance(locale); - this.formatListener = new FormatTrackingHSSFListener(this, locale); + this.formatListener = new TikaFormatTrackingHSSFListener(this, locale);new FormatTrackingHSSFListener(this, locale); + this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale); } /** @@ -614,6 +617,35 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { findPictures(escherRecord.getChildRecords()); } } + private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener { + //TIKA-2025 -- use this to preserve large numbers in "General" format + //against the MS spec. + final TikaExcelGeneralFormat generalFormat; + public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) { + super(childListener, locale); + generalFormat = new TikaExcelGeneralFormat(locale); + } + + @Override + public String formatNumberDateCell(CellValueRecordInterface cell) { + String formatString = this.getFormatString(cell); + if (formatString != null && ! formatString.equals("General")) { + return super.formatNumberDateCell(cell); + } + + double value; + if(cell instanceof NumberRecord) { + value = ((NumberRecord)cell).getValue(); + } else { + if(!(cell instanceof FormulaRecord)) { + throw new IllegalArgumentException("Unsupported CellValue Record passed in " + cell); + } + + value = ((FormulaRecord)cell).getValue(); + } + return generalFormat.format(value); + } + } } /** http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java new file mode 100644 index 0000000..7144d73 --- /dev/null +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.util.Locale; + +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.util.LocaleUtil; + +/** + * Overrides Excel's General format to include more + * significant digits than the MS Spec allows. + * See TIKA-2025. + */ +public class TikaExcelDataFormatter extends DataFormatter { + + public TikaExcelDataFormatter() { + this(LocaleUtil.getUserLocale()); + } + + public TikaExcelDataFormatter (Locale locale) { + super(locale); + addFormat("General", new TikaExcelGeneralFormat(locale)); + addFormat("general", new TikaExcelGeneralFormat(locale)); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java new file mode 100644 index 0000000..6ddc0e0 --- /dev/null +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java @@ -0,0 +1,90 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + + +import java.math.BigDecimal; +import java.math.MathContext; +import java.math.RoundingMode; +import java.text.DecimalFormat; +import java.text.DecimalFormatSymbols; +import java.text.FieldPosition; +import java.text.Format; +import java.text.ParsePosition; +import java.util.Locale; + +import org.apache.poi.ss.usermodel.DataFormatter; + +/** + * A Format that allows up to 15 significant digits for integers. + * This goes against the Microsoft spec, but it preserves information + * for long strings of digits. + * <p> + * This was derived from POI's ExcelGeneralNumberFormat + */ +public class TikaExcelGeneralFormat extends Format { + + private static final long serialVersionUID = 1L; + + private static final MathContext TO_15_SF = new MathContext(15, RoundingMode.HALF_UP); + + private final DecimalFormatSymbols decimalSymbols; + private final DecimalFormat integerFormat; + private final DecimalFormat decimalFormat; + private final DecimalFormat scientificFormat; + + public TikaExcelGeneralFormat(final Locale locale) { + decimalSymbols = DecimalFormatSymbols.getInstance(locale); + scientificFormat = new DecimalFormat("0.##############E0", decimalSymbols); + DataFormatter.setExcelStyleRoundingMode(scientificFormat); + integerFormat = new DecimalFormat("#", decimalSymbols); + DataFormatter.setExcelStyleRoundingMode(integerFormat); + decimalFormat = new DecimalFormat("#.##########", decimalSymbols); + DataFormatter.setExcelStyleRoundingMode(decimalFormat); + } + + public StringBuffer format(Object number, StringBuffer toAppendTo, FieldPosition pos) { + final double value; + if (number instanceof Number) { + value = ((Number) number).doubleValue(); + if (Double.isInfinite(value) || Double.isNaN(value)) { + return integerFormat.format(number, toAppendTo, pos); + } + } else { + // testBug54786 gets here with a date, so retain previous behaviour + return integerFormat.format(number, toAppendTo, pos); + } + final double abs = Math.abs(value); + if (abs > 1E15 || (abs <= 1E-15 && abs > 0)) { + return scientificFormat.format(number, toAppendTo, pos); + } else if (Math.floor(value) == value || abs > 1E15) { + // integer, or integer portion uses all 15 allowed digits + return integerFormat.format(number, toAppendTo, pos); + } + // Non-integers of non-scientific magnitude are formatted as "up to 11 + // numeric characters, with the decimal point counting as a numeric + // character". We know there is a decimal point, so limit to 10 digits. + // https://support.microsoft.com/en-us/kb/65903 + final double rounded = new BigDecimal(value).round(TO_15_SF).doubleValue(); + return decimalFormat.format(rounded, toAppendTo, pos); + } + + public Object parseObject(String source, ParsePosition pos) { + throw new UnsupportedOperationException(); + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index 79ec3c4..d375dd9 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -18,7 +18,11 @@ package org.apache.tika.parser.microsoft.ooxml; import java.io.IOException; import java.io.InputStream; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; @@ -47,6 +51,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaMetadataKeys; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.TikaExcelDataFormatter; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.xmlbeans.XmlException; import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink; @@ -83,9 +88,9 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { extractor.setLocale(locale); if (locale == null) { - formatter = new DataFormatter(); + formatter = new TikaExcelDataFormatter(); } else { - formatter = new DataFormatter(locale); + formatter = new TikaExcelDataFormatter(locale); } } http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java index 109ad9c..6f411f5 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java @@ -409,4 +409,14 @@ public class ExcelParserTest extends TikaTest { List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_embeddedPDF.xls"); assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE)); } + + @Test + public void testBigIntegersWGeneralFormat() throws Exception { + //TIKA-2025 + String xml = getXML("testEXCEL_big_numbers.xls").xml; + assertContains("123456789012345", xml);//15 digit number + assertContains("123456789012346", xml);//15 digit formula + assertContains("1.23456789012345E15", xml);//16 digit number is treated as scientific notation + assertContains("1.23456789012345E15", xml);//16 digit formula, ditto + } } http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 0a29ab2..8c1ee39 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -1236,6 +1236,16 @@ public class OOXMLParserTest extends TikaTest { assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip", Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME))); } + + @Test + public void testBigIntegersWGeneralFormat() throws Exception { + //TIKA-2025 + String xml = getXML("testEXCEL_big_numbers.xlsx").xml; + assertContains("123456789012345", xml);//15 digit number + assertContains("123456789012346", xml);//15 digit formula + assertContains("1.23456789012345E+15", xml);//16 digit number is treated as scientific notation + assertContains("1.23456789012345E+15", xml);//16 digit formula, ditto + } } http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xls ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xls b/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xls new file mode 100644 index 0000000..c5c10f0 Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xls differ http://git-wip-us.apache.org/repos/asf/tika/blob/f4bacf85/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx b/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx new file mode 100644 index 0000000..ce5dd8e Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx differ