This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 4e03f90 TIKA-2438 -- ooxml locale should be set via POI's LocaleUtil.
Fix unit tests to be robust in different locales. Many thanks to Karl Richter
for raising this issue.
4e03f90 is described below
commit 4e03f90cf0a20375819f33371ec094386a744f36
Author: tballison <[email protected]>
AuthorDate: Wed Aug 2 07:32:15 2017 -0400
TIKA-2438 -- ooxml locale should be set via POI's LocaleUtil. Fix unit
tests to be robust in different locales. Many thanks to Karl Richter for
raising this issue.
---
CHANGES.txt | 4 +
.../apache/tika/parser/microsoft/OfficeParser.java | 3 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 3 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 111 +++++++++++++--------
4 files changed, 77 insertions(+), 44 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index a32bc5f..9ecb916 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
Release 1.17 - ???
+ * OOXML locale should be set by POI's LocaleUtil not Locale.getDefault().
+ Fix unit tests to be robust against different locales in OOXML
+ and ExcelParser (TIKA-2438).
+
* Upgrade to PDFBox 2.0.7 (TIKA-2431).
* Tika now has support for automatic image captioning, that
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 1dc9cc5..0375156 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -40,6 +40,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.macros.VBAMacroReader;
import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LocaleUtil;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -179,7 +180,7 @@ public class OfficeParser extends AbstractOfficeParser {
break;
case WORKBOOK:
case XLR:
- Locale locale = context.get(Locale.class, Locale.getDefault());
+ Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
break;
case PROJECT:
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 63fd60f..b6f7bf5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -30,6 +30,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.util.LocaleUtil;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
@@ -61,7 +62,7 @@ public class OOXMLExtractorFactory {
InputStream stream, ContentHandler baseHandler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- Locale locale = context.get(Locale.class, Locale.getDefault());
+ Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
ExtractorFactory.setThreadPrefersEventExtractors(true);
try {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index c512157..54fdb5b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1348,7 +1348,28 @@ public class OOXMLParserTest extends TikaTest {
DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
//16 digit number is treated as scientific notation as is the 16 digit formula
assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E+15</td>\t"+
- "<td>1"+symbols.getDecimalSeparator()+"23456789012345E+15", xml); }
+ "<td>1"+symbols.getDecimalSeparator()+"23456789012345E+15", xml);
+ }
+
+ @Test
+ public void testBigIntegersWGeneralFormatWLocaleIT() throws Exception {
+ LocaleUtil.setUserLocale(Locale.ITALIAN);
+ //TIKA-2438
+ try {
+ String xml = getXML("testEXCEL_big_numbers.xlsx").xml;
+ assertContains("123456789012345", xml);//15 digit number
+ assertContains("123456789012346", xml);//15 digit formula
+ Locale locale = LocaleUtil.getUserLocale();
+
+ DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
+ //16 digit number is treated as scientific notation as is the 16 digit formula
+ assertContains("1" + symbols.getDecimalSeparator() + "23456789012345E+15</td>\t" +
+ "<td>1" + symbols.getDecimalSeparator() + "23456789012345E+15", xml);
+ } finally {
+ LocaleUtil.setUserLocale(USER_LOCALE);
+ }
+ }
+
@Test
public void testBoldHyperlink() throws Exception {
@@ -1540,47 +1561,53 @@ public class OOXMLParserTest extends TikaTest {
@Test
public void testXLSBVarious() throws Exception {
- OfficeParserConfig officeParserConfig = new OfficeParserConfig();
- officeParserConfig.setExtractMacros(true);
- ParseContext parseContext = new ParseContext();
- parseContext.set(OfficeParserConfig.class, officeParserConfig);
- List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_various.xlsb", parseContext);
- assertEquals(4, metadataList.size());
-
- String xml = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
- assertContains("<td>13</td>", xml);
- assertContains("<td>13.1211231321</td>", xml);
- assertContains("<td>$ 3.03</td>", xml);
- assertContains("<td>20%</td>", xml);
- assertContains("<td>13.12</td>", xml);
- assertContains("<td>123456789012345</td>", xml);
- assertContains("<td>1.23456789012345E+15</td>", xml);
- assertContains("test comment2", xml);
-
- assertContains("comment4 (end of row)", xml);
-
-
- assertContains("<td>1/4</td>", xml);
- assertContains("<td>3/9/17</td>", xml);
- assertContains("<td>4</td>", xml);
- assertContains("<td>2</td>", xml);
-
- assertContains("<td> 46/1963</td>", xml);
- assertContains("<td> 3/128</td>", xml);
- assertContains("test textbox", xml);
-
- assertContains("test WordArt", xml);
-
- assertContains("<a href=\"http://lucene.apache.org/\">http://lucene.apache.org/</a>", xml);
- assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>", xml);
-
- assertContains("OddLeftHeader OddCenterHeader OddRightHeader", xml);
- assertContains("EvenLeftHeader EvenCenterHeader EvenRightHeader", xml);
-
- assertContains("FirstPageLeftHeader FirstPageCenterHeader FirstPageRightHeader", xml);
- assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
- assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
- assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);
+ try {
+ LocaleUtil.setUserLocale(Locale.US);
+ //have to set to US because of a bug in POI for $ 3.03 in Locale.ITALIAN
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setExtractMacros(true);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+ List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_various.xlsb", parseContext);
+ assertEquals(4, metadataList.size());
+
+ String xml = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertContains("<td>13</td>", xml);
+ assertContains("<td>13.1211231321</td>", xml);
+ assertContains("<td>$ 3.03</td>", xml);
+ assertContains("<td>20%</td>", xml);
+ assertContains("<td>13.12</td>", xml);
+ assertContains("<td>123456789012345</td>", xml);
+ assertContains("<td>1.23456789012345E+15</td>", xml);
+ assertContains("test comment2", xml);
+
+ assertContains("comment4 (end of row)", xml);
+
+
+ assertContains("<td>1/4</td>", xml);
+ assertContains("<td>3/9/17</td>", xml);
+ assertContains("<td>4</td>", xml);
+ assertContains("<td>2</td>", xml);
+
+ assertContains("<td> 46/1963</td>", xml);
+ assertContains("<td> 3/128</td>", xml);
+ assertContains("test textbox", xml);
+
+ assertContains("test WordArt", xml);
+
+ assertContains("<a href=\"http://lucene.apache.org/\">http://lucene.apache.org/</a>", xml);
+ assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>", xml);
+
+ assertContains("OddLeftHeader OddCenterHeader OddRightHeader", xml);
+ assertContains("EvenLeftHeader EvenCenterHeader EvenRightHeader", xml);
+
+ assertContains("FirstPageLeftHeader FirstPageCenterHeader FirstPageRightHeader", xml);
+ assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
+ assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
+ assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);
+ } finally {
+ LocaleUtil.setUserLocale(USER_LOCALE);
+ }
}
@Test
--
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.