Repository: tika Updated Branches: refs/heads/2.x 1b32e3186 -> 32d9ece8d
TIKA-2013 -- upgrade to POI 3.15 -- don't forget to close new NPOIFS and MAPIMessage Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/12b1d435 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/12b1d435 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/12b1d435 Branch: refs/heads/2.x Commit: 12b1d435bbdc5df9d5e396285c83ddeda44240ae Parents: 1b32e31 Author: tballison <talli...@mitre.org> Authored: Wed Sep 21 14:23:00 2016 -0400 Committer: tballison <talli...@mitre.org> Committed: Wed Sep 21 14:23:00 2016 -0400 ---------------------------------------------------------------------- CHANGES.txt | 4 +-- tika-bundle/pom.xml | 4 +-- .../tika-parser-office-bundle/pom.xml | 1 + tika-parser-modules/pom.xml | 2 +- .../parser/microsoft/JackcessExtractor.java | 5 +-- .../tika/parser/microsoft/OfficeParser.java | 38 ++++++++++++-------- .../tika/parser/microsoft/OutlookExtractor.java | 12 +++++-- 7 files changed, 42 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index baee8b4..662217d 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -17,6 +17,8 @@ Release 2.0 - ??? Release 1.14 - ??? + * Upgrade to POI 3.15-final (TIKA-2013). + * Upgrade to PDFBox 2.0.3 (TIKA-2051). * Prevent OOM/permanent hang on some corrupt CHM files (TIKA-2040). @@ -45,8 +47,6 @@ Release 1.14 - ??? * iCal and vCalendar (TIKA-2006) * MBOX (TIKA-2042) - * Upgrade to PDFBox 2.0.2 (TIKA-1996). - * Add configurable maximum threshold for number of events extracted from the XMP Media Management Schema in JempboxExtractor (TIKA-1999). http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml index 3b7a6ce..e8f3e83 100644 --- a/tika-bundle/pom.xml +++ b/tika-bundle/pom.xml @@ -126,7 +126,7 @@ <Embed-Dependency> tika-parsers;inline=true, commons-compress, xz, commons-codec, commons-csv, - commons-io, commons-exec, junrar, + commons-io, commons-exec, commons-collections4, junrar, pdfbox,pdfbox-tools,pdfbox-debugger,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on, poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas, curvesapi, @@ -444,4 +444,4 @@ <system>Jenkins</system> <url>https://builds.apache.org/job/Tika-trunk/</url> </ciManagement> -</project> +</project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-bundles/tika-parser-office-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-office-bundle/pom.xml b/tika-parser-bundles/tika-parser-office-bundle/pom.xml index f6b2169..1529c97 100644 --- a/tika-parser-bundles/tika-parser-office-bundle/pom.xml +++ b/tika-parser-bundles/tika-parser-office-bundle/pom.xml @@ -65,6 +65,7 @@ commons-lang;inline=true, commons-io;inline=true, commons-codec;inline=true, + commons-collections4;inline=true, poi;inline=true, poi-scratchpad;inline=true, poi-ooxml;inline=true, http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml index ef92a7c..dc3b409 100644 --- a/tika-parser-modules/pom.xml +++ b/tika-parser-modules/pom.xml @@ -35,7 +35,7 @@ <url>http://tika.apache.org/</url> <properties> - <poi.version>3.15-beta1</poi.version> + <poi.version>3.15</poi.version> <!-- NOTE: sync codec version with POI --> <codec.version>1.10</codec.version> <pdfbox.version>2.0.3</pdfbox.version> http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java index fb8a2c2..4f26ff0 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java @@ -328,8 +328,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor { } private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { - NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream()); - handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml); + try (NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream())) { + handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml); + } } String formatCurrency(Double d, DataType type) { http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index f5f9f3e..b6681aa 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -35,6 +35,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.IOUtils; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -95,26 +96,33 @@ public class OfficeParser extends AbstractParser { final DirectoryNode root; TikaInputStream tstream = TikaInputStream.cast(stream); - if (tstream == null) { - root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot(); - } else { - final Object container = tstream.getOpenContainer(); - if (container instanceof NPOIFSFileSystem) { - root = ((NPOIFSFileSystem) container).getRoot(); - } else if (container instanceof DirectoryNode) { - root = (DirectoryNode) container; + NPOIFSFileSystem mustCloseFs = null; + try { + if (tstream == null) { + mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream)); + root = mustCloseFs.getRoot(); } else { - NPOIFSFileSystem fs; - if (tstream.hasFile()) { - fs = new NPOIFSFileSystem(tstream.getFile(), true); + final Object container = tstream.getOpenContainer(); + if (container instanceof NPOIFSFileSystem) { + root = ((NPOIFSFileSystem) container).getRoot(); + } else if (container instanceof DirectoryNode) { + root = (DirectoryNode) container; } else { - fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)); + NPOIFSFileSystem fs = null; + if (tstream.hasFile()) { + fs = new NPOIFSFileSystem(tstream.getFile(), true); + } else { + fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)); + } + //tstream will close the fs, no need to close this below + tstream.setOpenContainer(fs); + root = fs.getRoot(); } - tstream.setOpenContainer(fs); - root = fs.getRoot(); } + parse(root, context, metadata, xhtml); + } finally { + IOUtils.closeQuietly(mustCloseFs); } - parse(root, context, metadata, xhtml); xhtml.endDocument(); } http://git-wip-us.apache.org/repos/asf/tika/blob/12b1d435/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index a922c5d..74a95e7 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -16,6 +16,8 @@ */ package org.apache.tika.parser.microsoft; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; @@ -62,8 +64,6 @@ import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; -import static java.nio.charset.StandardCharsets.UTF_8; - /** * Outlook Message Parser. */ @@ -260,6 +260,14 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } catch (ChunkNotFoundException e) { throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e); + } finally { + if (msg != null) { + try { + msg.close(); + } catch (IOException e) { + //swallow + } + } } }