Repository: tika
Updated Branches:
  refs/heads/2.x 9f6241161 -> 176f3aded


TIKA-2055: catch exception when totalTime is out of unsigned int range in OOXML


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ae0cb305
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ae0cb305
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ae0cb305

Branch: refs/heads/2.x
Commit: ae0cb3059a9ae8e21685c344ad6e8a123039f272
Parents: 9f62411
Author: tballison <[email protected]>
Authored: Thu Sep 15 11:57:11 2016 -0400
Committer: tballison <[email protected]>
Committed: Thu Sep 15 11:57:11 2016 -0400

----------------------------------------------------------------------
 .../parser/microsoft/ooxml/MetadataExtractor.java    | 15 +++++++++++++--
 .../tika/parser/microsoft/ooxml/OOXMLParserTest.java |  7 +++++++
 2 files changed, 20 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/ae0cb305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 25d3596..91d49c7 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
 import 
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
 import 
org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
 
@@ -113,6 +114,16 @@ public class MetadataExtractor {
                                  Metadata metadata) {
         CTProperties propsHolder = properties.getUnderlyingProperties();
 
+        //TIKA-2055, some ooxml files can include unsigned int/long values
+        //which cause this exception.
+        //For now, catch it and record as '0' because
+        //Word converts to '0' on save.
+        int totalTime = 0;
+        try {
+            totalTime = propsHolder.getTotalTime();
+        } catch (XmlValueOutOfRangeException e) {
+            //swallow for now
+        }
         addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, 
propsHolder.getApplication());
         addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, 
propsHolder.getAppVersion());
         addProperty(metadata, TikaCoreProperties.PUBLISHER, 
propsHolder.getCompany());
@@ -121,7 +132,7 @@ public class MetadataExtractor {
         addProperty(metadata, OfficeOpenXMLExtended.NOTES, 
propsHolder.getNotes());
         addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, 
propsHolder.getPresentationFormat());
         addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, 
propsHolder.getTemplate());
-        addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, 
propsHolder.getTotalTime());
+        addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime);
 
         if (propsHolder.getPages() > 0) {
             metadata.set(PagedText.N_PAGES, propsHolder.getPages());
@@ -146,7 +157,7 @@ public class MetadataExtractor {
         addProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
         addProperty(metadata, Metadata.PRESENTATION_FORMAT, 
propsHolder.getPresentationFormat());
         addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
-        addProperty(metadata, Metadata.TOTAL_TIME, propsHolder.getTotalTime());
+        addProperty(metadata, Metadata.TOTAL_TIME, totalTime);
         addProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages());
         addProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides());
         addProperty(metadata, MSOffice.PARAGRAPH_COUNT, 
propsHolder.getParagraphs());

http://git-wip-us.apache.org/repos/asf/tika/blob/ae0cb305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index c19abf8..ef9291c 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1255,6 +1255,13 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("<a href=\"http://tika.apache.org/\">hyper 
<b>link</b></a>", xml);
         assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> 
link</a>; bold" , xml);
     }
+
+    @Test
+    public void testLongForIntExceptionInSummaryDetails() throws Exception {
+        //TIKA-2055
+        assertContains("bold", 
getXML("testWORD_totalTimeOutOfRange.docx").xml);
+    }
+
 }
 
 

Reply via email to