Repository: tika Updated Branches: refs/heads/master 07aea36f7 -> 415381212
TIKA-2013 -- upgrade to POI 3.15-final, make sure to add new close() throughout for MAPIMessage and NPOIFS Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/cc6f6dcc Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/cc6f6dcc Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/cc6f6dcc Branch: refs/heads/master Commit: cc6f6dcc8fed2826ae8093b7a4aed0ddee74dc40 Parents: 07aea36 Author: tballison <talli...@mitre.org> Authored: Wed Sep 21 13:23:49 2016 -0400 Committer: tballison <talli...@mitre.org> Committed: Wed Sep 21 13:23:49 2016 -0400 ---------------------------------------------------------------------- CHANGES.txt | 4 +-- tika-bundle/pom.xml | 2 +- tika-parsers/pom.xml | 2 +- .../parser/microsoft/JackcessExtractor.java | 5 +-- .../tika/parser/microsoft/OfficeParser.java | 38 ++++++++++++-------- .../tika/parser/microsoft/OutlookExtractor.java | 12 +++++-- 6 files changed, 40 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/cc6f6dcc/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 6f0fda6..6597dc9 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,7 @@ Release 1.14 - ??? + * Upgrade to POI.3-15 (TIKA-2013). + * Upgrade to PDFBox 2.0.3 (TIKA-2051). * Fix hyperlinks with formatting in DOC and DOCX (TIKA-1255 @@ -42,8 +44,6 @@ Release 1.14 - ??? * MBOX (TIKA-2042) * Stata DTA (TIKA-2064) - * Upgrade to PDFBox 2.0.2 (TIKA-1996). - * Add configurable maximum threshold for number of events extracted from the XMP Media Management Schema in JempboxExtractor (TIKA-1999). http://git-wip-us.apache.org/repos/asf/tika/blob/cc6f6dcc/tika-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml index d350f10..02247f1 100644 --- a/tika-bundle/pom.xml +++ b/tika-bundle/pom.xml @@ -126,7 +126,7 @@ <Embed-Dependency> tika-parsers;inline=true, commons-compress, xz, commons-codec, commons-csv, - commons-io, commons-exec, junrar, + commons-io, commons-exec, commons-collections4, junrar, pdfbox,pdfbox-tools,pdfbox-debugger,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on, poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas, curvesapi, http://git-wip-us.apache.org/repos/asf/tika/blob/cc6f6dcc/tika-parsers/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 06fec12..06e2520 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -35,7 +35,7 @@ <url>http://tika.apache.org/</url> <properties> - <poi.version>3.15-beta1</poi.version> + <poi.version>3.15</poi.version> <!-- NOTE: sync codec version with POI --> <codec.version>1.10</codec.version> <!-- NOTE: sync tukaani version with commons-compress in tika-parent--> http://git-wip-us.apache.org/repos/asf/tika/blob/cc6f6dcc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java index 11a88c2..4d45059 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java @@ -326,8 +326,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor { } private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { - NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream()); - handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml); + try (NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream())) { + handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml); + } } String formatCurrency(Double d, DataType type) { http://git-wip-us.apache.org/repos/asf/tika/blob/cc6f6dcc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index f5f9f3e..b6681aa 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -35,6 +35,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.IOUtils; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -95,26 +96,33 @@ public class OfficeParser extends AbstractParser { final DirectoryNode root; TikaInputStream tstream = TikaInputStream.cast(stream); - if (tstream == null) { - root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot(); - } else { - final Object container = tstream.getOpenContainer(); - if (container instanceof NPOIFSFileSystem) { - root = ((NPOIFSFileSystem) container).getRoot(); - } else if (container instanceof DirectoryNode) { - root = (DirectoryNode) container; + NPOIFSFileSystem mustCloseFs = null; + try { + if (tstream == null) { + mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream)); + root = mustCloseFs.getRoot(); } else { - NPOIFSFileSystem fs; - if (tstream.hasFile()) { - fs = new NPOIFSFileSystem(tstream.getFile(), true); + final Object container = tstream.getOpenContainer(); + if (container instanceof NPOIFSFileSystem) { + root = ((NPOIFSFileSystem) container).getRoot(); + } else if (container instanceof DirectoryNode) { + root = (DirectoryNode) container; } else { - fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)); + NPOIFSFileSystem fs = null; + if (tstream.hasFile()) { + fs = new NPOIFSFileSystem(tstream.getFile(), true); + } else { + fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)); + } + //tstream will close the fs, no need to close this below + tstream.setOpenContainer(fs); + root = fs.getRoot(); } - tstream.setOpenContainer(fs); - root = fs.getRoot(); } + parse(root, context, metadata, xhtml); + } finally { + IOUtils.closeQuietly(mustCloseFs); } - parse(root, context, metadata, xhtml); xhtml.endDocument(); } http://git-wip-us.apache.org/repos/asf/tika/blob/cc6f6dcc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 14397b9..c40a3f4 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -16,6 +16,8 @@ */ package org.apache.tika.parser.microsoft; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; @@ -60,8 +62,6 @@ import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; -import static java.nio.charset.StandardCharsets.UTF_8; - /** * Outlook Message Parser. */ @@ -254,6 +254,14 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } catch (ChunkNotFoundException e) { throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e); + } finally { + if (msg != null) { + try { + msg.close(); + } catch (IOException e) { + //swallow + } + } } }