This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 37e72fcc94b2729cb1627264f18bc12485b47dbc
Author: tallison <[email protected]>
AuthorDate: Wed Nov 13 09:43:55 2024 -0500

    TIKA-4345 -- allow users to turn off the injection of headers into the content stream for MSG files.
---
 CHANGES.txt                                        |  3 ++
 .../tika/parser/microsoft/OfficeParserConfig.java  |  3 +-
 .../tika/parser/microsoft/OutlookParserTest.java   | 33 +++++++++++-----------
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 395f41b07..e8e8472fd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -3,6 +3,9 @@ Release 4.0.0-BETA1 - ???
 
 Release 3.1.0 - ??
 
+   * Allow users to turn off the injection of some headers into the content stream of MSG
+     files (TIKA-4345).
+
    * Add a wrapper for Google's magika detector (TIKA-4344).
 
    * Add support for MachO via Alexey Pelykh (TIKA-4309).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index af69eefa1..863d90619 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -36,7 +36,8 @@ public class OfficeParserConfig implements Serializable {
     private boolean useSAXPptxExtractor = false;
 
     private boolean extractAllAlternativesFromMSG = false;
-    private boolean writeSelectHeadersInBody = false;
+    //we'll stop doing this in 4.x
+    private boolean writeSelectHeadersInBody = true;
     private String dateOverrideFormat = null;
     private int maxOverride = 0;//ignore
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 686a6657c..56e15439f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -53,19 +53,20 @@ public class OutlookParserTest extends TikaTest {
 
     @Test
     public void testOutlookParsing() throws Exception {
-
-        //test default behavior
+        Metadata metadata = new Metadata();
+        //test turning off header injection
         List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg", 
AUTO_DETECT_PARSER,
-                BasicContentHandlerFactory.HANDLER_TYPE.BODY);
+                metadata, configureDontInjectHeaders(),
+                true, BasicContentHandlerFactory.HANDLER_TYPE.BODY);
         assertNotContained("Microsoft Outlook Express 6", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
 
 
-        //test legacy behavior
+        //test default
         ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
+        metadata = new Metadata();
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/test-outlook.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
         }
         assertEquals("application/vnd.ms-outlook", 
metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("Microsoft Outlook Express 6", 
metadata.get(TikaCoreProperties.TITLE));
@@ -107,7 +108,7 @@ public class OutlookParserTest extends TikaTest {
         Metadata metadata = new Metadata();
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/testMSG.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
         }
 
         assertEquals("application/vnd.ms-outlook", 
metadata.get(Metadata.CONTENT_TYPE));
@@ -185,7 +186,7 @@ public class OutlookParserTest extends TikaTest {
         handler.setResult(new StreamResult(sw));
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/testMSG_chinese.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
         }
 
         // As the HTML version should have been processed, ensure
@@ -243,11 +244,6 @@ public class OutlookParserTest extends TikaTest {
     @Test
     public void testOutlookHTMLfromRTF() throws Exception {
 
-        //test default behavior
-        List<Metadata> metadataList = 
getRecursiveMetadata("test-outlook2003.msg");
-        assertNotContained("<dd>New Outlook User</dd>", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
-
-        //test legacy behavior with the configuration set
         Metadata metadata = new Metadata();
 
         // Check the HTML version
@@ -259,7 +255,7 @@ public class OutlookParserTest extends TikaTest {
         handler.setResult(new StreamResult(sw));
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/test-outlook2003.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
         }
 
         // As the HTML version should have been processed, ensure
@@ -280,12 +276,17 @@ public class OutlookParserTest extends TikaTest {
         // Make sure we don't have nested html docs
         assertEquals(2, content.split("<body>").length);
         assertEquals(2, content.split("<\\/body>").length);
+
+        //test configurable behavior
+        List<Metadata> metadataList = 
getRecursiveMetadata("test-outlook2003.msg", AUTO_DETECT_PARSER, new Metadata(),
+                configureDontInjectHeaders(), true);
+        assertNotContained("<dd>New Outlook User</dd>", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
     }
 
-    private ParseContext configureInjectHeaders() {
+    private ParseContext configureDontInjectHeaders() {
         ParseContext parseContext = new ParseContext();
         OfficeParserConfig officeParserConfig = new OfficeParserConfig();
-        officeParserConfig.setWriteSelectHeadersInBody(true);
+        officeParserConfig.setWriteSelectHeadersInBody(false);
         parseContext.set(OfficeParserConfig.class, officeParserConfig);
         return parseContext;
     }

Reply via email to