Repository: tika Updated Branches: refs/heads/master 952fb54ed -> a383567c2
TIKA-2025 -- override general format in excel to extract 15 digit integers Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a383567c Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a383567c Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a383567c Branch: refs/heads/master Commit: a383567c2c947603c4c7aa12d3578d771bb58413 Parents: 952fb54 Author: tballison <[email protected]> Authored: Fri Jul 22 08:48:11 2016 -0400 Committer: tballison <[email protected]> Committed: Fri Jul 22 08:48:11 2016 -0400 ---------------------------------------------------------------------- CHANGES.txt | 3 + .../tika/parser/microsoft/ExcelExtractor.java | 34 ++++++- .../microsoft/TikaExcelDataFormatter.java | 41 +++++++++ .../microsoft/TikaExcelGeneralFormat.java | 90 +++++++++++++++++++ .../ooxml/XSSFExcelExtractorDecorator.java | 20 +++-- .../tika/parser/microsoft/ExcelParserTest.java | 10 +++ .../parser/microsoft/ooxml/OOXMLParserTest.java | 10 +++ .../test-documents/testEXCEL_big_numbers.xls | Bin 0 -> 26112 bytes .../test-documents/testEXCEL_big_numbers.xlsx | Bin 0 -> 8396 bytes 9 files changed, 201 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/a383567c/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 6ba831f..924a5b1 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,8 @@ Release 1.14 - ??? + * Maintain more significant digits in cells of "General" format + in XLS and XLSX (TIKA-2025). + * Avoid mark/reset issues when extracting or detecting embedded resources in RFC822 emails (TIKA-2037). http://git-wip-us.apache.org/repos/asf/tika/blob/a383567c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java index 87f395c..1f336d8 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java @@ -219,6 +219,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { * formatting within the extraction. */ private FormatTrackingHSSFListener formatListener; + private final TikaExcelDataFormatter tikaExcelDataFormatter; + /** * List of worksheet names. */ @@ -255,7 +257,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { this.handler = handler; this.extractor = extractor; this.format = NumberFormat.getInstance(locale); - this.formatListener = new FormatTrackingHSSFListener(this, locale); + this.formatListener = new TikaFormatTrackingHSSFListener(this, locale); + this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale); } /** @@ -614,6 +617,35 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { findPictures(escherRecord.getChildRecords()); } } + private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener { + //TIKA-2025 -- use this to preserve large numbers in "General" format + //against the MS spec. + final TikaExcelGeneralFormat generalFormat; + public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) { + super(childListener, locale); + generalFormat = new TikaExcelGeneralFormat(locale); + } + + @Override + public String formatNumberDateCell(CellValueRecordInterface cell) { + String formatString = this.getFormatString(cell); + if (formatString != null && ! formatString.equals("General")) { + return super.formatNumberDateCell(cell); + } + + double value; + if(cell instanceof NumberRecord) { + value = ((NumberRecord)cell).getValue(); + } else { + if(!(cell instanceof FormulaRecord)) { + throw new IllegalArgumentException("Unsupported CellValue Record passed in " + cell); + } + + value = ((FormulaRecord)cell).getValue(); + } + return generalFormat.format(value); + } + } } /** http://git-wip-us.apache.org/repos/asf/tika/blob/a383567c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java new file mode 100644 index 0000000..7144d73 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelDataFormatter.java @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import java.util.Locale; + +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.util.LocaleUtil; + +/** + * Overrides Excel's General format to include more + * significant digits than the MS Spec allows. + * See TIKA-2025. + */ +public class TikaExcelDataFormatter extends DataFormatter { + + public TikaExcelDataFormatter() { + this(LocaleUtil.getUserLocale()); + } + + public TikaExcelDataFormatter (Locale locale) { + super(locale); + addFormat("General", new TikaExcelGeneralFormat(locale)); + addFormat("general", new TikaExcelGeneralFormat(locale)); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/a383567c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java new file mode 100644 index 0000000..6ddc0e0 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TikaExcelGeneralFormat.java @@ -0,0 +1,90 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + + +import java.math.BigDecimal; +import java.math.MathContext; +import java.math.RoundingMode; +import java.text.DecimalFormat; +import java.text.DecimalFormatSymbols; +import java.text.FieldPosition; +import java.text.Format; +import java.text.ParsePosition; +import java.util.Locale; + +import org.apache.poi.ss.usermodel.DataFormatter; + +/** + * A Format that allows up to 15 significant digits for integers. + * This goes against the Microsoft spec, but it preserves information + * for long strings of digits. + * <p> + * This was derived from POI's ExcelGeneralNumberFormat + */ +public class TikaExcelGeneralFormat extends Format { + + private static final long serialVersionUID = 1L; + + private static final MathContext TO_15_SF = new MathContext(15, RoundingMode.HALF_UP); + + private final DecimalFormatSymbols decimalSymbols; + private final DecimalFormat integerFormat; + private final DecimalFormat decimalFormat; + private final DecimalFormat scientificFormat; + + public TikaExcelGeneralFormat(final Locale locale) { + decimalSymbols = DecimalFormatSymbols.getInstance(locale); + scientificFormat = new DecimalFormat("0.##############E0", decimalSymbols); + DataFormatter.setExcelStyleRoundingMode(scientificFormat); + integerFormat = new DecimalFormat("#", decimalSymbols); + DataFormatter.setExcelStyleRoundingMode(integerFormat); + decimalFormat = new DecimalFormat("#.##########", decimalSymbols); + DataFormatter.setExcelStyleRoundingMode(decimalFormat); + } + + public StringBuffer format(Object number, StringBuffer toAppendTo, FieldPosition pos) { + final double value; + if (number instanceof Number) { + value = ((Number) number).doubleValue(); + if (Double.isInfinite(value) || Double.isNaN(value)) { + return integerFormat.format(number, toAppendTo, pos); + } + } else { + // testBug54786 gets here with a date, so retain previous behaviour + return integerFormat.format(number, toAppendTo, pos); + } + final double abs = Math.abs(value); + if (abs > 1E15 || (abs <= 1E-15 && abs > 0)) { + return scientificFormat.format(number, toAppendTo, pos); + } else if (Math.floor(value) == value || abs > 1E15) { + // integer, or integer portion uses all 15 allowed digits + return integerFormat.format(number, toAppendTo, pos); + } + // Non-integers of non-scientific magnitude are formatted as "up to 11 + // numeric characters, with the decimal point counting as a numeric + // character". We know there is a decimal point, so limit to 10 digits. + // https://support.microsoft.com/en-us/kb/65903 + final double rounded = new BigDecimal(value).round(TO_15_SF).doubleValue(); + return decimalFormat.format(rounded, toAppendTo, pos); + } + + public Object parseObject(String source, ParsePosition pos) { + throw new UnsupportedOperationException(); + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/a383567c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index a252793..ae8b6cb 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -18,13 +18,21 @@ package org.apache.tika.parser.microsoft.ooxml; import java.io.IOException; import java.io.InputStream; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; -import com.microsoft.schemas.vml.CTH; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; -import org.apache.poi.openxml4j.opc.*; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackagePartName; +import org.apache.poi.openxml4j.opc.PackageRelationship; +import org.apache.poi.openxml4j.opc.PackagingURIHelper; +import org.apache.poi.openxml4j.opc.TargetMode; import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.usermodel.HeaderFooter; import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; @@ -43,9 +51,9 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaMetadataKeys; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.TikaExcelDataFormatter; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.xmlbeans.XmlException; -import org.bouncycastle.util.Pack; import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink; import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps; import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape; @@ -79,9 +87,9 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { extractor.setLocale(locale); if (locale == null) { - formatter = new DataFormatter(); + formatter = new TikaExcelDataFormatter(); } else { - formatter = new DataFormatter(locale); + formatter = new TikaExcelDataFormatter(locale); } } http://git-wip-us.apache.org/repos/asf/tika/blob/a383567c/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java index fe8edd2..cb93b55 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java @@ -462,4 +462,14 @@ public class ExcelParserTest extends TikaTest { List<Metadata> metadataList = getRecursiveMetadata("testExcel_embeddedPDF.xls"); assertContains("Hello World!", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT)); } + + @Test + public void testBigIntegersWGeneralFormat() throws Exception { + //TIKA-2025 + String xml = getXML("testEXCEL_big_numbers.xls").xml; + assertContains("123456789012345", xml);//15 digit number + assertContains("123456789012346", xml);//15 digit formula + assertContains("1.23456789012345E15", xml);//16 digit number is treated as scientific notation + assertContains("1.23456789012345E15", xml);//16 digit formula, ditto + } } http://git-wip-us.apache.org/repos/asf/tika/blob/a383567c/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index bc2b0ae..8625fa3 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -1237,6 +1237,16 @@ public class OOXMLParserTest extends TikaTest { assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip", Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME))); } + + @Test + public void testBigIntegersWGeneralFormat() throws Exception { + //TIKA-2025 + String xml = getXML("testEXCEL_big_numbers.xlsx").xml; + assertContains("123456789012345", xml);//15 digit number + assertContains("123456789012346", xml);//15 digit formula + assertContains("1.23456789012345E+15", xml);//16 digit number is treated as scientific notation + assertContains("1.23456789012345E+15", xml);//16 digit formula, ditto + } } http://git-wip-us.apache.org/repos/asf/tika/blob/a383567c/tika-parsers/src/test/resources/test-documents/testEXCEL_big_numbers.xls ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_big_numbers.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_big_numbers.xls new file mode 100644 index 0000000..c5c10f0 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_big_numbers.xls differ http://git-wip-us.apache.org/repos/asf/tika/blob/a383567c/tika-parsers/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx new file mode 100644 index 0000000..ce5dd8e Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_big_numbers.xlsx differ
