Repository: tika Updated Branches: refs/heads/2.x 9f6241161 -> 176f3aded
TIKA 2055 catch exception when totalTime out of unsigned int range in ooxml Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ae0cb305 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ae0cb305 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ae0cb305 Branch: refs/heads/2.x Commit: ae0cb3059a9ae8e21685c344ad6e8a123039f272 Parents: 9f62411 Author: tballison <[email protected]> Authored: Thu Sep 15 11:57:11 2016 -0400 Committer: tballison <[email protected]> Committed: Thu Sep 15 11:57:11 2016 -0400 ---------------------------------------------------------------------- .../parser/microsoft/ooxml/MetadataExtractor.java | 15 +++++++++++++-- .../tika/parser/microsoft/ooxml/OOXMLParserTest.java | 7 +++++++ 2 files changed, 20 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/ae0cb305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java index 25d3596..91d49c7 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java @@ -36,6 +36,7 @@ import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.microsoft.SummaryExtractor; +import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException; import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty; import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties; @@ -113,6 +114,16 @@ public class MetadataExtractor { Metadata metadata) { CTProperties propsHolder = properties.getUnderlyingProperties(); + //TIKA-2055, some ooxml files can include unsigned int/long values + //which cause this exception. + //For now, catch it and record as '0' because + //Word converts to '0' on save. + int totalTime = 0; + try { + totalTime = propsHolder.getTotalTime(); + } catch (XmlValueOutOfRangeException e) { + //swallow for now + } addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication()); addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion()); addProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany()); @@ -121,7 +132,7 @@ public class MetadataExtractor { addProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes()); addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat()); addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate()); - addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, propsHolder.getTotalTime()); + addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime); if (propsHolder.getPages() > 0) { metadata.set(PagedText.N_PAGES, propsHolder.getPages()); @@ -146,7 +157,7 @@ public class MetadataExtractor { addProperty(metadata, Metadata.NOTES, propsHolder.getNotes()); addProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat()); addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate()); - addProperty(metadata, Metadata.TOTAL_TIME, propsHolder.getTotalTime()); + addProperty(metadata, Metadata.TOTAL_TIME, totalTime); addProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages()); addProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides()); addProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs()); http://git-wip-us.apache.org/repos/asf/tika/blob/ae0cb305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index c19abf8..ef9291c 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -1255,6 +1255,13 @@ public class OOXMLParserTest extends TikaTest { assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml); assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml); } + + @Test + public void testLongForIntExceptionInSummaryDetails() throws Exception { + //TIKA-2055 + assertContains("bold", getXML("testWORD_totalTimeOutOfRange.docx").xml); + } + }
