This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 4e03f90  TIKA-2438 -- ooxml locale should be set via POI's LocaleUtil.  Fix unit tests to be robust in different locales.  Many thanks to Karl Richter for raising this issue.
4e03f90 is described below

commit 4e03f90cf0a20375819f33371ec094386a744f36
Author: tballison <[email protected]>
AuthorDate: Wed Aug 2 07:32:15 2017 -0400

    TIKA-2438 -- ooxml locale should be set via POI's LocaleUtil.  Fix unit
    tests to be robust in different locales.  Many thanks to Karl Richter for
    raising this issue.
---
 CHANGES.txt                                        |   4 +
 .../apache/tika/parser/microsoft/OfficeParser.java |   3 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |   3 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    | 111 +++++++++++++--------
 4 files changed, 77 insertions(+), 44 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index a32bc5f..9ecb916 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
 Release 1.17 - ???
 
+  * OOXML locale should be set by POI's LocaleUtil not Locale.getDefault().
+    Fix unit tests to be robust against different locales in OOXML
+    and ExcelParser (TIKA-2438).
+
   * Upgrade to PDFBox 2.0.7 (TIKA-2431).
 
   * Tika now has support for automatic image captioning, that
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 1dc9cc5..0375156 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -40,6 +40,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.poifs.macros.VBAMacroReader;
 import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LocaleUtil;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -179,7 +180,7 @@ public class OfficeParser extends AbstractOfficeParser {
                 break;
             case WORKBOOK:
             case XLR:
-                Locale locale = context.get(Locale.class, Locale.getDefault());
+                Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
                 new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
                 break;
             case PROJECT:
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 63fd60f..b6f7bf5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -30,6 +30,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.util.LocaleUtil;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
@@ -61,7 +62,7 @@ public class OOXMLExtractorFactory {
             InputStream stream, ContentHandler baseHandler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        Locale locale = context.get(Locale.class, Locale.getDefault());
+        Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
         ExtractorFactory.setThreadPrefersEventExtractors(true);
 
         try {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index c512157..54fdb5b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1348,7 +1348,28 @@ public class OOXMLParserTest extends TikaTest {
         DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
         //16 digit number is treated as scientific notation as is the 16 digit formula
         assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E+15</td>\t"+
-                "<td>1"+symbols.getDecimalSeparator()+"23456789012345E+15", xml);    }
+                "<td>1"+symbols.getDecimalSeparator()+"23456789012345E+15", xml);
+    }
+
+    @Test
+    public void testBigIntegersWGeneralFormatWLocaleIT() throws Exception {
+        LocaleUtil.setUserLocale(Locale.ITALIAN);
+        //TIKA-2438
+        try {
+            String xml = getXML("testEXCEL_big_numbers.xlsx").xml;
+            assertContains("123456789012345", xml);//15 digit number
+            assertContains("123456789012346", xml);//15 digit formula
+            Locale locale = LocaleUtil.getUserLocale();
+
+            DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
+            //16 digit number is treated as scientific notation as is the 16 digit formula
+            assertContains("1" + symbols.getDecimalSeparator() + "23456789012345E+15</td>\t" +
+                    "<td>1" + symbols.getDecimalSeparator() + "23456789012345E+15", xml);
+        } finally {
+            LocaleUtil.setUserLocale(USER_LOCALE);
+        }
+    }
+
 
     @Test
     public void testBoldHyperlink() throws Exception {
@@ -1540,47 +1561,53 @@ public class OOXMLParserTest extends TikaTest {
 
     @Test
     public void testXLSBVarious() throws Exception {
-        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
-        officeParserConfig.setExtractMacros(true);
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(OfficeParserConfig.class, officeParserConfig);
-        List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_various.xlsb", parseContext);
-        assertEquals(4, metadataList.size());
-
-        String xml = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
-        assertContains("<td>13</td>", xml);
-        assertContains("<td>13.1211231321</td>", xml);
-        assertContains("<td>$   3.03</td>", xml);
-        assertContains("<td>20%</td>", xml);
-        assertContains("<td>13.12</td>", xml);
-        assertContains("<td>123456789012345</td>", xml);
-        assertContains("<td>1.23456789012345E+15</td>", xml);
-        assertContains("test comment2", xml);
-
-        assertContains("comment4 (end of row)", xml);
-
-
-        assertContains("<td>1/4</td>", xml);
-        assertContains("<td>3/9/17</td>", xml);
-        assertContains("<td>4</td>", xml);
-        assertContains("<td>2</td>", xml);
-
-        assertContains("<td>   46/1963</td>", xml);
-        assertContains("<td>  3/128</td>", xml);
-        assertContains("test textbox", xml);
-
-        assertContains("test WordArt", xml);
-
-        assertContains("<a href=\"http://lucene.apache.org/\">http://lucene.apache.org/</a>", xml);
-        assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>", xml);
-
-        assertContains("OddLeftHeader OddCenterHeader OddRightHeader", xml);
-        assertContains("EvenLeftHeader EvenCenterHeader EvenRightHeader", xml);
-
-        assertContains("FirstPageLeftHeader FirstPageCenterHeader FirstPageRightHeader", xml);
-        assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
-        assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
-        assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);
+        try {
+            LocaleUtil.setUserLocale(Locale.US);
+            //have to set to US because of a bug in POI for $   3.03 in Locale.ITALIAN
+            OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+            officeParserConfig.setExtractMacros(true);
+            ParseContext parseContext = new ParseContext();
+            parseContext.set(OfficeParserConfig.class, officeParserConfig);
+            List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_various.xlsb", parseContext);
+            assertEquals(4, metadataList.size());
+
+            String xml = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+            assertContains("<td>13</td>", xml);
+            assertContains("<td>13.1211231321</td>", xml);
+            assertContains("<td>$   3.03</td>", xml);
+            assertContains("<td>20%</td>", xml);
+            assertContains("<td>13.12</td>", xml);
+            assertContains("<td>123456789012345</td>", xml);
+            assertContains("<td>1.23456789012345E+15</td>", xml);
+            assertContains("test comment2", xml);
+
+            assertContains("comment4 (end of row)", xml);
+
+
+            assertContains("<td>1/4</td>", xml);
+            assertContains("<td>3/9/17</td>", xml);
+            assertContains("<td>4</td>", xml);
+            assertContains("<td>2</td>", xml);
+
+            assertContains("<td>   46/1963</td>", xml);
+            assertContains("<td>  3/128</td>", xml);
+            assertContains("test textbox", xml);
+
+            assertContains("test WordArt", xml);
+
+            assertContains("<a href=\"http://lucene.apache.org/\">http://lucene.apache.org/</a>", xml);
+            assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>", xml);
+
+            assertContains("OddLeftHeader OddCenterHeader OddRightHeader", xml);
+            assertContains("EvenLeftHeader EvenCenterHeader EvenRightHeader", xml);
+
+            assertContains("FirstPageLeftHeader FirstPageCenterHeader FirstPageRightHeader", xml);
+            assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
+            assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
+            assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);
+        } finally {
+            LocaleUtil.setUserLocale(USER_LOCALE);
+        }
     }
 
     @Test

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to