This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4345
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 099bdc9684c768000ba1a61297b177976adb7d91
Author: tallison <[email protected]>
AuthorDate: Thu Nov 7 09:20:31 2024 -0500

    TIKA-4345 -- extract metadata before writing to pstmailitem body so that more metadata is written to the xhtml
---
 .../org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java    | 4 ++--
 .../org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index 15b4cf0fa..a87c6cb84 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -87,11 +87,11 @@ public class PSTMailItemParser implements Parser {
     private void parseMailAndAttachments(PSTMessage pstMsg, 
XHTMLContentHandler handler, Metadata metadata, ParseContext context,
                                          EmbeddedDocumentExtractor 
embeddedExtractor)
             throws SAXException, IOException, TikaException {
+        extractMetadata(pstMsg, metadata);
         AttributesImpl attributes = new AttributesImpl();
         attributes.addAttribute("", "class", "class", "CDATA", "embedded");
         attributes.addAttribute("", "id", "id", "CDATA", 
pstMsg.getInternetMessageId());
         handler.startElement("div", attributes);
-        handler.element("h1", pstMsg.getSubject());
 
         parseMailItem(pstMsg, handler, metadata, context);
         parseMailAttachments(pstMsg, handler, metadata, context, 
embeddedExtractor);
@@ -100,7 +100,7 @@ public class PSTMailItemParser implements Parser {
 
     private void parseMailItem(PSTMessage pstMail, XHTMLContentHandler xhtml,
                                 Metadata metadata, ParseContext context) 
throws SAXException, IOException, TikaException {
-        extractMetadata(pstMail, metadata);
+
         //try the html first. It preserves logical paragraph markers
         String htmlChunk = pstMail.getBodyHTML();
         if (! StringUtils.isBlank(htmlChunk)) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index 8807b4782..6e9a6d6d1 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -51,10 +51,9 @@ public class OutlookPSTParserTest extends TikaTest {
         assertTrue(output.contains("<meta name=\"Content-Type\" 
content=\"application/vnd.ms-outlook-pst\""));
 
         assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
-        assertTrue(output.contains("<div class=\"embedded\" 
id=\"&lt;[email protected]&gt;\">" + "<h1>Re: Feature 
Generators</h1>"));
+        assertTrue(output.contains("<div class=\"embedded\" 
id=\"&lt;[email protected]&gt;\">"));
         assertTrue(output.contains(
-                "<div class=\"embedded\" 
id=\"&lt;[email protected]" + 
".bf1.yahoo.com&gt;\"><h1>Re: init tokenizer fails: \"Bad type in " +
-                        "putfield/putstatic\"</h1>"));
+                "<div class=\"embedded\" 
id=\"&lt;[email protected]" + 
".bf1.yahoo.com&gt;\">"));
         assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
 
         assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine 
(pour la recherche)</h1>"));
@@ -79,6 +78,8 @@ public class OutlookPSTParserTest extends TikaTest {
         assertEquals("[email protected]", 
m1.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
         assertEquals("NOTE", m1.get(Office.MAPI_MESSAGE_CLASS));
         assertEquals("/Début du fichier de données Outlook", 
m1.get(PST.PST_FOLDER_PATH));
+        //test that subject is making it into the xhtml
+        assertContains("<meta name=\"dc:subject\" content=\"Re: Feature 
Generators\"", m1.get(TikaCoreProperties.TIKA_CONTENT));
 
         Metadata m6 = metadataList.get(6);
         assertEquals("Couchbase", m6.get(Message.MESSAGE_FROM_NAME));

Reply via email to