This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new b968dbca8 TIKA-4116 (#1285) b968dbca8 is described below commit b968dbca8929d4f09113285bb5a1609cc5088eb4 Author: Tim Allison <talli...@apache.org> AuthorDate: Tue Aug 15 12:02:06 2023 -0400 TIKA-4116 (#1285) * TIKA-4116 -- don't extract macros from directory nodes --- CHANGES.txt | 2 ++ .../java/org/apache/tika/parser/microsoft/OfficeParser.java | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 85d9e0dad..55bd83671 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -5,6 +5,8 @@ Release 2.8.1 - ??? the PDFParser now throws an EncryptedDocumentException instead of an IOException if the security handler cannot be found (TIKA-4082). + * Fix bug that led to duplicate extraction of macros from some OLE2 containers (TIKA-4116). + * Changed default decompressConcatenated to true in CompressorParser. Users may revert to legacy behavior via tika-config.xml (TIKA-4048). diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index c3c81e792..c082b30d0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -155,6 +155,7 @@ public class OfficeParser extends AbstractOfficeParser { final DirectoryNode root; TikaInputStream tstream = TikaInputStream.cast(stream); POIFSFileSystem mustCloseFs = null; + boolean isDirectoryNode = false; try { if (tstream == null) { mustCloseFs = new 
POIFSFileSystem(CloseShieldInputStream.wrap(stream)); @@ -165,6 +166,7 @@ public class OfficeParser extends AbstractOfficeParser { root = ((POIFSFileSystem) container).getRoot(); } else if (container instanceof DirectoryNode) { root = (DirectoryNode) container; + isDirectoryNode = true; } else { POIFSFileSystem fs = null; if (tstream.hasFile()) { @@ -187,8 +189,12 @@ public class OfficeParser extends AbstractOfficeParser { //We might consider not bothering to check for macros in root, //if we know we're processing ppt based on content-type identified in metadata - extractMacros(root.getFileSystem(), xhtml, - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); + if (! isDirectoryNode) { + // if the "root" is a directory node, we assume that the macros have already + // been extracted from the parent's fileSystem -- TIKA-4116 + extractMacros(root.getFileSystem(), xhtml, + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); + } } } finally {