This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4345 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 099bdc9684c768000ba1a61297b177976adb7d91 Author: tallison <[email protected]> AuthorDate: Thu Nov 7 09:20:31 2024 -0500 TIKA-4345 -- extract metadata before writing to pstmailitem body so that more metadata is written to the xhtml --- .../org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java | 4 ++-- .../org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java index 15b4cf0fa..a87c6cb84 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java @@ -87,11 +87,11 @@ public class PSTMailItemParser implements Parser { private void parseMailAndAttachments(PSTMessage pstMsg, XHTMLContentHandler handler, Metadata metadata, ParseContext context, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException, TikaException { + extractMetadata(pstMsg, metadata); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", pstMsg.getInternetMessageId()); handler.startElement("div", attributes); - handler.element("h1", pstMsg.getSubject()); parseMailItem(pstMsg, handler, metadata, context); parseMailAttachments(pstMsg, handler, metadata, context, embeddedExtractor); @@ -100,7 +100,7 @@ public class PSTMailItemParser implements Parser { private 
void parseMailItem(PSTMessage pstMail, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws SAXException, IOException, TikaException { - extractMetadata(pstMail, metadata); + //try the html first. It preserves logical paragraph markers String htmlChunk = pstMail.getBodyHTML(); if (! StringUtils.isBlank(htmlChunk)) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java index 8807b4782..6e9a6d6d1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java @@ -51,10 +51,9 @@ public class OutlookPSTParserTest extends TikaTest { assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\"")); assertTrue(output.contains("<body><div class=\"email-folder\"><h1>")); - assertTrue(output.contains("<div class=\"embedded\" id=\"<[email protected]>\">" + "<h1>Re: Feature Generators</h1>")); + assertTrue(output.contains("<div class=\"embedded\" id=\"<[email protected]>\">")); assertTrue(output.contains( - "<div class=\"embedded\" id=\"<[email protected]" + ".bf1.yahoo.com>\"><h1>Re: init tokenizer fails: \"Bad type in " + - "putfield/putstatic\"</h1>")); + "<div class=\"embedded\" id=\"<[email protected]" + ".bf1.yahoo.com>\">")); assertTrue(output.contains("Gary Murphy commented on TIKA-1250:")); assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>")); @@ -79,6 +78,8 @@ public class 
OutlookPSTParserTest extends TikaTest { assertEquals("[email protected]", m1.get(Office.MAPI_FROM_REPRESENTING_EMAIL)); assertEquals("NOTE", m1.get(Office.MAPI_MESSAGE_CLASS)); assertEquals("/Début du fichier de données Outlook", m1.get(PST.PST_FOLDER_PATH)); + //test that subject is making it into the xhtml + assertContains("<meta name=\"dc:subject\" content=\"Re: Feature Generators\"", m1.get(TikaCoreProperties.TIKA_CONTENT)); Metadata m6 = metadataList.get(6); assertEquals("Couchbase", m6.get(Message.MESSAGE_FROM_NAME));
