This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4345-v2
in repository https://gitbox.apache.org/repos/asf/tika.git

commit aed6b2a5c94fae76503e143d4502ee056f741d4c
Author: tallison <[email protected]>
AuthorDate: Thu May 8 10:43:27 2025 -0400

    TIKA-4345 -- add back configurability for injecting headers into the body of emails (legacy pre-4.x behavior)
---
 .../tika/parser/microsoft/OfficeParserConfig.java  | 10 ++++++++
 .../tika/parser/microsoft/OutlookExtractor.java    | 28 ++++++++++++++++++++++
 .../tika/parser/microsoft/OutlookParserTest.java   | 17 +++++++++++++
 3 files changed, 55 insertions(+)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 8e761efad..bfa2865e2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -35,6 +35,8 @@ public class OfficeParserConfig implements Serializable {
     private boolean useSAXDocxExtractor = false;
     private boolean useSAXPptxExtractor = false;
 
+    private boolean writeSelectHeadersInBody = false;
+
     private boolean extractAllAlternativesFromMSG = false;
     private String dateOverrideFormat = null;
     private int maxOverride = 0;//ignore
@@ -276,6 +278,14 @@ public class OfficeParserConfig implements Serializable {
     public int getMaxOverride() {
         return this.maxOverride;
     }
+
+    public boolean isWriteSelectHeadersInBody() {
+        return writeSelectHeadersInBody;
+    }
+
+    public void setWriteSelectHeadersInBody(boolean writeSelectHeadersInBody) {
+        this.writeSelectHeadersInBody = writeSelectHeadersInBody;
+    }
 }
 
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 0e219dac6..e13234d5c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -267,6 +267,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         }
 
         handleGeneralDates(msg, headers, parentMetadata);
+        writeSelectHeadersInBody(parentMetadata, msg, xhtml);
 
         // Get the message body. Preference order is: html, rtf, text
         Chunk htmlChunk = null;
@@ -859,6 +860,33 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         return false;
     }
 
+    private void writeSelectHeadersInBody(Metadata metadata, MAPIMessage msg, XHTMLContentHandler xhtml)
+            throws SAXException, ChunkNotFoundException {
+        if (! officeParserConfig.isWriteSelectHeadersInBody()) {
+            return;
+        }
+        String subject = metadata.get(TikaCoreProperties.SUBJECT);
+        subject = (subject == null) ? "" : subject;
+        xhtml.element("h1", subject);
+
+        // Output the from and to details in text, as you
+        //  often want them in text form for searching
+        xhtml.startElement("dl");
+        String from = metadata.get(Message.MESSAGE_FROM);
+        if (from != null) {
+            header(xhtml, "From", from);
+        }
+        header(xhtml, "To", msg.getDisplayTo());
+        header(xhtml, "Cc", msg.getDisplayCC());
+        header(xhtml, "Bcc", msg.getDisplayBCC());
+        try {
+            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+        } catch (ChunkNotFoundException e) {
+            //swallow
+        }
+        xhtml.endElement("dl");
+    }
+
     private List<Recipient> buildRecipients() {
         RecipientChunks[] recipientChunks = msg.getRecipientDetailsChunks();
         if (recipientChunks == null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index ba8a6c64e..9da786e6e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -401,4 +401,21 @@ public class OutlookParserTest extends TikaTest {
         assertContains("annuaires\t \n" + " Synchronisation", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
     }
 
+    @Test
+    public void testHeadersInBody() throws Exception {
+        //test default behavior -- no headers
+        ParseContext parseContext = new ParseContext();
+        String xml = getText("testMSG.msg", new Metadata(), parseContext);
+        assertTrue(xml.startsWith("Hi,"));
+
+        //test configurable behavior (legacy behavior up to Tika 4.x)
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setWriteSelectHeadersInBody(true);
+        parseContext.set(OfficeParserConfig.class, officeParserConfig);
+        xml = getText("testMSG.msg", new Metadata(), parseContext);
+        xml = xml.replaceAll("\\s+", " ");
+        assertTrue(xml.startsWith("MIME registry use cases"));
+        assertContains("From Jukka Zitting", xml);
+    }
+
 }

Reply via email to