This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 4c8fa8a1a TIKA-4418 -- actually include title in body when injecting select metadata into the body for msg files
4c8fa8a1a is described below

commit 4c8fa8a1add15eff6a36a06f2ffaf38b1189723b
Author: tallison <[email protected]>
AuthorDate: Mon May 19 10:31:15 2025 -0400

    TIKA-4418 -- actually include title in body when injecting select metadata into the body for msg files
    
    # Conflicts:
    #       tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    #       tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
---
 .../tika/parser/microsoft/OfficeParserConfig.java  | 15 ++++++++++++
 .../tika/parser/microsoft/OutlookExtractor.java    | 28 ++++++++++++++++++++++
 .../tika/parser/microsoft/OutlookParserTest.java   | 11 +++++++--
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 8e761efad..5eeff69a5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -35,6 +35,8 @@ public class OfficeParserConfig implements Serializable {
     private boolean useSAXDocxExtractor = false;
     private boolean useSAXPptxExtractor = false;
 
+    private boolean writeSelectHeadersInBody = false;
+
     private boolean extractAllAlternativesFromMSG = false;
     private String dateOverrideFormat = null;
     private int maxOverride = 0;//ignore
@@ -276,6 +278,19 @@ public class OfficeParserConfig implements Serializable {
     public int getMaxOverride() {
         return this.maxOverride;
     }
+
+    /**
+     * The default changed to <code>false</code> in 4.x. For legacy 3.x behavior,
+     * set this to <code>true</code>.
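+     * @return <code>true</code> if select headers (subject, from, to, cc, bcc, recipients) are written into the body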
+     */
+    public boolean isWriteSelectHeadersInBody() {
+        return writeSelectHeadersInBody;
+    }
+
+    public void setWriteSelectHeadersInBody(boolean writeSelectHeadersInBody) {
+        this.writeSelectHeadersInBody = writeSelectHeadersInBody;
+    }
 }
 
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 0e219dac6..e8351c18d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -267,6 +267,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         }
 
         handleGeneralDates(msg, headers, parentMetadata);
+        writeSelectHeadersInBody(parentMetadata, msg, xhtml);
 
         // Get the message body. Preference order is: html, rtf, text
         Chunk htmlChunk = null;
@@ -859,6 +860,33 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         return false;
     }
 
+    private void writeSelectHeadersInBody(Metadata metadata, MAPIMessage msg, XHTMLContentHandler xhtml)
+            throws SAXException, ChunkNotFoundException {
+        if (! officeParserConfig.isWriteSelectHeadersInBody()) {
+            return;
+        }
+        String subject = metadata.get(TikaCoreProperties.TITLE);
+        subject = (subject == null) ? "" : subject;
+        xhtml.element("h1", subject);
+
+        // Output the from and to details in text, as you
+        //  often want them in text form for searching
+        xhtml.startElement("dl");
+        String from = metadata.get(Message.MESSAGE_FROM);
+        if (from != null) {
+            header(xhtml, "From", from);
+        }
+        header(xhtml, "To", msg.getDisplayTo());
+        header(xhtml, "Cc", msg.getDisplayCC());
+        header(xhtml, "Bcc", msg.getDisplayBCC());
+        try {
+            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+        } catch (ChunkNotFoundException e) {
+            //swallow
+        }
+        xhtml.endElement("dl");
+    }
+
     private List<Recipient> buildRecipients() {
         RecipientChunks[] recipientChunks = msg.getRecipientDetailsChunks();
         if (recipientChunks == null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index ba8a6c64e..a40e02b51 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -55,7 +55,6 @@ public class OutlookParserTest extends TikaTest {
     @Test
     public void testOutlookParsing() throws Exception {
 
-
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
 
@@ -88,7 +87,15 @@ public class OutlookParserTest extends TikaTest {
         assertNotContained("Microsoft Outlook Express 6", content);
         assertNotContained("L'\u00C9quipe Microsoft Outlook Express", content);
         assertNotContained("Nouvel utilisateur de Outlook Express", content);
-        assertContains("Messagerie et groupes de discussion", content);
+
+
+        //now try with inlining select headers
+        ParseContext parseContext = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setWriteSelectHeadersInBody(true);
+        parseContext.set(OfficeParserConfig.class, officeParserConfig);
+        content = getText("test-outlook.msg", new Metadata(), parseContext);
+        assertTrue(content.startsWith("Microsoft Outlook Express 6"));
     }
 
     /**

Reply via email to