This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 37e72fcc94b2729cb1627264f18bc12485b47dbc Author: tallison <[email protected]> AuthorDate: Wed Nov 13 09:43:55 2024 -0500 TIKA-4345 -- allow users to turn off the injection of headers into the content stream for MSG files. --- CHANGES.txt | 3 ++ .../tika/parser/microsoft/OfficeParserConfig.java | 3 +- .../tika/parser/microsoft/OutlookParserTest.java | 33 +++++++++++----------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 395f41b07..e8e8472fd 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -3,6 +3,9 @@ Release 4.0.0-BETA1 - ??? Release 3.1.0 - ?? + * Allow users to turn off the injection of some headers into the content stream of MSG + files (TIKA-4345). + * Add a wrapper for Google's magika detector (TIKA-4344). * Add support for MachO via Alexey Pelykh (TIKA-4309). diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java index af69eefa1..863d90619 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java @@ -36,7 +36,8 @@ public class OfficeParserConfig implements Serializable { private boolean useSAXPptxExtractor = false; private boolean extractAllAlternativesFromMSG = false; - private boolean writeSelectHeadersInBody = false; + //we'll stop doing this in 4.x + private boolean writeSelectHeadersInBody = true; private String dateOverrideFormat = null; private int maxOverride = 0;//ignore diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index 686a6657c..56e15439f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -53,19 +53,20 @@ public class OutlookParserTest extends TikaTest { @Test public void testOutlookParsing() throws Exception { - - //test default behavior + Metadata metadata = new Metadata(); + //test turning off header injection List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg", AUTO_DETECT_PARSER, - BasicContentHandlerFactory.HANDLER_TYPE.BODY); + metadata, configureDontInjectHeaders(), + true, BasicContentHandlerFactory.HANDLER_TYPE.BODY); assertNotContained("Microsoft Outlook Express 6", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); - //test legacy behavior + //test default ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); + metadata = new Metadata(); try (InputStream stream = getResourceAsStream("/test-documents/test-outlook.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); } assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Microsoft Outlook Express 6", metadata.get(TikaCoreProperties.TITLE)); @@ -107,7 +108,7 @@ public class OutlookParserTest extends TikaTest { Metadata metadata = new Metadata(); try (InputStream stream = getResourceAsStream("/test-documents/testMSG.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); } assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE)); @@ -185,7 +186,7 @@ public class OutlookParserTest extends TikaTest { handler.setResult(new StreamResult(sw)); try (InputStream stream = getResourceAsStream("/test-documents/testMSG_chinese.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); } // As the HTML version should have been processed, ensure @@ -243,11 +244,6 @@ public class OutlookParserTest extends TikaTest { @Test public void testOutlookHTMLfromRTF() throws Exception { - //test default behavior - List<Metadata> metadataList = getRecursiveMetadata("test-outlook2003.msg"); - assertNotContained("<dd>New Outlook User</dd>", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); - - //test legacy behavior with the configuration set Metadata metadata = new Metadata(); // Check the HTML version @@ -259,7 +255,7 @@ public class OutlookParserTest extends TikaTest { handler.setResult(new StreamResult(sw)); try (InputStream stream = getResourceAsStream("/test-documents/test-outlook2003.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); } // As the HTML version should have been processed, ensure @@ -280,12 +276,17 @@ public class OutlookParserTest extends TikaTest { // Make sure we don't have nested html docs assertEquals(2, content.split("<body>").length); assertEquals(2, content.split("<\\/body>").length); + + //test configurable behavior + List<Metadata> metadataList = getRecursiveMetadata("test-outlook2003.msg", AUTO_DETECT_PARSER, new Metadata(), + configureDontInjectHeaders(), true); + assertNotContained("<dd>New Outlook User</dd>", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); } - private ParseContext configureInjectHeaders() { + private ParseContext configureDontInjectHeaders() { ParseContext parseContext = new ParseContext(); OfficeParserConfig officeParserConfig = new OfficeParserConfig(); - officeParserConfig.setWriteSelectHeadersInBody(true); + officeParserConfig.setWriteSelectHeadersInBody(false); parseContext.set(OfficeParserConfig.class, officeParserConfig); return parseContext; }
