This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4c8fa8a1a TIKA-4418 -- actually include title in body when injecting select metadata into the body for msg files
4c8fa8a1a is described below
commit 4c8fa8a1add15eff6a36a06f2ffaf38b1189723b
Author: tallison <[email protected]>
AuthorDate: Mon May 19 10:31:15 2025 -0400
TIKA-4418 -- actually include title in body when injecting select metadata into the body for msg files
# Conflicts:
# tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
# tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
---
.../tika/parser/microsoft/OfficeParserConfig.java | 15 ++++++++++++
.../tika/parser/microsoft/OutlookExtractor.java | 28 ++++++++++++++++++++++
.../tika/parser/microsoft/OutlookParserTest.java | 11 +++++++--
3 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 8e761efad..5eeff69a5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -35,6 +35,8 @@ public class OfficeParserConfig implements Serializable {
private boolean useSAXDocxExtractor = false;
private boolean useSAXPptxExtractor = false;
+ private boolean writeSelectHeadersInBody = false;
+
private boolean extractAllAlternativesFromMSG = false;
private String dateOverrideFormat = null;
private int maxOverride = 0;//ignore
@@ -276,6 +278,19 @@ public class OfficeParserConfig implements Serializable {
public int getMaxOverride() {
return this.maxOverride;
}
+
+ /**
+ * The default changed to <code>false</code> in 4.x. For legacy 3.x behavior,
+ * set this to <code>true</code>.
+ * @return
+ */
+ public boolean isWriteSelectHeadersInBody() {
+ return writeSelectHeadersInBody;
+ }
+
+ public void setWriteSelectHeadersInBody(boolean writeSelectHeadersInBody) {
+ this.writeSelectHeadersInBody = writeSelectHeadersInBody;
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 0e219dac6..e8351c18d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -267,6 +267,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
}
handleGeneralDates(msg, headers, parentMetadata);
+ writeSelectHeadersInBody(parentMetadata, msg, xhtml);
// Get the message body. Preference order is: html, rtf, text
Chunk htmlChunk = null;
@@ -859,6 +860,33 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
return false;
}
+ private void writeSelectHeadersInBody(Metadata metadata, MAPIMessage msg, XHTMLContentHandler xhtml)
+ throws SAXException, ChunkNotFoundException {
+ if (! officeParserConfig.isWriteSelectHeadersInBody()) {
+ return;
+ }
+ String subject = metadata.get(TikaCoreProperties.TITLE);
+ subject = (subject == null) ? "" : subject;
+ xhtml.element("h1", subject);
+
+ // Output the from and to details in text, as you
+ // often want them in text form for searching
+ xhtml.startElement("dl");
+ String from = metadata.get(Message.MESSAGE_FROM);
+ if (from != null) {
+ header(xhtml, "From", from);
+ }
+ header(xhtml, "To", msg.getDisplayTo());
+ header(xhtml, "Cc", msg.getDisplayCC());
+ header(xhtml, "Bcc", msg.getDisplayBCC());
+ try {
+ header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+ } catch (ChunkNotFoundException e) {
+ //swallow
+ }
+ xhtml.endElement("dl");
+ }
+
private List<Recipient> buildRecipients() {
RecipientChunks[] recipientChunks = msg.getRecipientDetailsChunks();
if (recipientChunks == null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index ba8a6c64e..a40e02b51 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -55,7 +55,6 @@ public class OutlookParserTest extends TikaTest {
@Test
public void testOutlookParsing() throws Exception {
-
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
@@ -88,7 +87,15 @@ public class OutlookParserTest extends TikaTest {
assertNotContained("Microsoft Outlook Express 6", content);
assertNotContained("L'\u00C9quipe Microsoft Outlook Express", content);
assertNotContained("Nouvel utilisateur de Outlook Express", content);
- assertContains("Messagerie et groupes de discussion", content);
+
+
+ //now try with inlining select headers
+ ParseContext parseContext = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setWriteSelectHeadersInBody(true);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+ content = getText("test-outlook.msg", new Metadata(), parseContext);
+ assertTrue(content.startsWith("Microsoft Outlook Express 6"));
}
/**