This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 3806e55ee TIKA-4360 -- improve extraction of mapi metadata (#2073)
3806e55ee is described below
commit 3806e55ee5f16ac10241f1e2e68f2237d6ec576f
Author: Tim Allison <[email protected]>
AuthorDate: Wed Dec 4 15:36:32 2024 -0500
TIKA-4360 -- improve extraction of mapi metadata (#2073)
---
.../main/java/org/apache/tika/metadata/MAPI.java | 63 +++++++
.../main/java/org/apache/tika/metadata/Office.java | 32 ----
.../tika/parser/microsoft/OutlookExtractor.java | 190 ++++++++++++++++-----
.../parser/microsoft/pst/PSTMailItemParser.java | 18 +-
.../tika/parser/microsoft/OutlookParserTest.java | 33 +++-
.../parser/microsoft/libpst/TestLibPstParser.java | 4 +-
.../parser/microsoft/pst/OutlookPSTParserTest.java | 18 +-
7 files changed, 252 insertions(+), 106 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
new file mode 100644
index 000000000..2cf41c7e0
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * MAPI message properties collection. These properties describe
+ * Outlook/MAPI messages; within this change they are set by
+ * OutlookExtractor and PSTMailItemParser.
+ * All keys start with {@link #PREFIX_MAPI_META}, i.e. "mapi" followed by
+ * the namespace prefix delimiter.
+ *
+ * NOTE(review): the "since 1.2" tag copied from Office was removed; add the real release.
+ */
+public interface MAPI {
+
+ String PREFIX_MAPI_META = "mapi" +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+ /**
+ * MAPI message class. What type of .msg/MAPI file is it?
+ */
+ Property MESSAGE_CLASS =
+ Property.internalClosedChoise(PREFIX_MAPI_META + "message-class",
"APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE",
+ "POST", "TASK", "UNKNOWN", "UNSPECIFIED");
+
+ Property SENT_BY_SERVER_TYPE = Property.internalText(PREFIX_MAPI_META +
"sent-by-server-type");
+
+ Property FROM_REPRESENTING_NAME = Property.internalText(PREFIX_MAPI_META +
"from-representing-name");
+
+ Property FROM_REPRESENTING_EMAIL = Property.internalText(PREFIX_MAPI_META
+ "from-representing-email");
+
+ Property SUBMISSION_ACCEPTED_AT_TIME =
Property.internalDate(PREFIX_MAPI_META + "msg-submission-accepted-at-time"); //NOTE(review): PSTMailItemParser stores getClientSubmitTime() under this key, while OutlookExtractor emits a separate "mapi:client-submit-time" - confirm intended
+
+ Property SUBMISSION_ID = Property.internalText(PREFIX_MAPI_META +
"msg-submission-id");
+
+ Property INTERNET_MESSAGE_ID = Property.internalText(PREFIX_MAPI_META +
"internet-message-id");
+
+ Property INTERNET_REFERENCES = Property.internalTextBag(PREFIX_MAPI_META +
"internet-references"); //the only multi-valued (bag) property in this interface
+
+
+ Property CONVERSATION_TOPIC = Property.internalText(PREFIX_MAPI_META +
"conversation-topic");
+
+ Property CONVERSATION_INDEX = Property.internalText(PREFIX_MAPI_META +
"conversation-index"); //OutlookExtractor stores the raw index bytes here as a hex string
+ Property IN_REPLY_TO_ID = Property.internalText(PREFIX_MAPI_META +
"in-reply-to-id");
+
+ Property RECIPIENTS_STRING = Property.internalText(PREFIX_MAPI_META +
"recipients-string");
+ Property IMPORTANCE = Property.internalInteger(PREFIX_MAPI_META +
"importance");
+ Property PRIORTY = Property.internalInteger(PREFIX_MAPI_META + "priority"); //sic: the constant name is misspelled; the metadata key itself is "priority"
+ Property IS_FLAGGED = Property.internalBoolean(PREFIX_MAPI_META +
"is-flagged");
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 2a9e428eb..aa4b9f002 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -145,30 +145,6 @@ public interface Office {
Property OBJECT_COUNT = Property.internalInteger(
PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"object-count");
- /**
- * MAPI message class. What type of .msg/MAPI file is it?
- */
- Property MAPI_MESSAGE_CLASS = Property.internalClosedChoise(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"mapi-message-class",
- "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK",
"UNKNOWN",
- "UNSPECIFIED");
-
- Property MAPI_SENT_BY_SERVER_TYPE = Property.internalText(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "mapi-sent-by-server-type");
-
- Property MAPI_FROM_REPRESENTING_NAME = Property.internalText(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "mapi-from-representing-name");
-
- Property MAPI_FROM_REPRESENTING_EMAIL = Property.internalText(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "mapi-from-representing-email");
-
- Property MAPI_MESSAGE_CLIENT_SUBMIT_TIME = Property.internalDate(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "mapi-msg-client-submit-time");
-
/**
* Embedded files may have a "progID" associated with them, such as
* Word.Document.12 or AcroExch.Document.DC
@@ -176,12 +152,4 @@ public interface Office {
Property PROG_ID = Property.internalText("msoffice:progID");
Property OCX_NAME = Property.internalText("msoffice:ocxName");
- Property MAPI_RECIPIENTS_STRING = Property.internalText(PREFIX_DOC_META +
- TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"mapi-recipients-string");
- Property MAPI_IMPORTANCE = Property.internalInteger(PREFIX_DOC_META +
- TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance");
- Property MAPI_PRIORTY = Property.internalInteger(PREFIX_DOC_META +
- TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance");
- Property MAPI_IS_FLAGGED = Property.internalBoolean(PREFIX_DOC_META +
- TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-is-flagged");
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 5a2dc996e..8f381e923 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -24,8 +24,10 @@ import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
+import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
+import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
@@ -34,6 +36,7 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.james.mime4j.codec.DecodeMonitor;
import org.apache.james.mime4j.codec.DecoderUtil;
@@ -44,6 +47,7 @@ import org.apache.poi.hsmf.datatypes.ByteChunk;
import org.apache.poi.hsmf.datatypes.Chunk;
import org.apache.poi.hsmf.datatypes.Chunks;
import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.poi.hsmf.datatypes.MessageSubmissionChunk;
import org.apache.poi.hsmf.datatypes.PropertyValue;
import org.apache.poi.hsmf.datatypes.RecipientChunks;
import org.apache.poi.hsmf.datatypes.StringChunk;
@@ -56,9 +60,9 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -80,17 +84,48 @@ import org.apache.tika.sax.XHTMLContentHandler;
public class OutlookExtractor extends AbstractPOIFSExtractor {
private static final Metadata EMPTY_METADATA = new Metadata();
-
- private static Pattern HEADER_KEY_PAT =
- Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");
-
- private final MAPIMessage msg;
+ private static final MAPIProperty[] LITERAL_TIME_MAPI_PROPERTIES = new
MAPIProperty[] { //MAPI time properties that handleMessageInfo copies into "mapi:"-prefixed date metadata
+ MAPIProperty.CLIENT_SUBMIT_TIME,
+ MAPIProperty.CREATION_TIME,
+ MAPIProperty.DEFERRED_DELIVERY_TIME,
+ MAPIProperty.DELIVER_TIME,
+ //TODO: should EXPAND BEGIN and EXPAND END be included as well?
+ MAPIProperty.EXPIRY_TIME,
+ MAPIProperty.LAST_MODIFICATION_TIME,
+ MAPIProperty.LATEST_DELIVERY_TIME,
+ MAPIProperty.MESSAGE_DELIVERY_TIME,
+ MAPIProperty.MESSAGE_DOWNLOAD_TIME,
+ MAPIProperty.ORIGINAL_DELIVERY_TIME,
+ MAPIProperty.ORIGINAL_SUBMIT_TIME,
+ MAPIProperty.PROVIDER_SUBMIT_TIME,
+ MAPIProperty.RECEIPT_TIME,
+ MAPIProperty.REPLY_TIME,
+ MAPIProperty.REPORT_TIME
+
+ };
+
+ private static final Map<MAPIProperty, Property> LITERAL_TIME_PROPERTIES =
new HashMap<>(); //MAPI time property -> Tika date Property; filled once by the static block below
+
+ static { //derive one Tika date Property per entry of LITERAL_TIME_MAPI_PROPERTIES
+ for (MAPIProperty property : LITERAL_TIME_MAPI_PROPERTIES) {
+ String name = property.mapiProperty.toLowerCase(Locale.ROOT);
+ name = name.substring(3); //drop the first three characters (presumably a "PR_" prefix - TODO confirm)
+ name = name.replace('_', '-'); //underscores become hyphens, e.g. "client-submit-time"
+ name = MAPI.PREFIX_MAPI_META + name;
+ Property tikaProp = Property.internalDate(name);
+ LITERAL_TIME_PROPERTIES.put(property, tikaProp);
+ }
+ }
//this according to the spec; in practice, it is probably more likely
//that a "split field" fails to start with a space character than
//that a real header contains anything but [-_A-Za-z0-9].
//e.g.
//header: this header goes onto the next line
//<mailto:[email protected]...
+ private static Pattern HEADER_KEY_PAT =
+ Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");
+
+ private final MAPIMessage msg;
private final ParseContext parseContext;
private final boolean extractAllAlternatives;
HtmlEncodingDetector detector = new HtmlEncodingDetector();
@@ -158,7 +193,7 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
msg.setReturnNullOnMissingChunk(true);
try {
- parentMetadata.set(Office.MAPI_MESSAGE_CLASS,
msg.getMessageClassEnum().name());
+ parentMetadata.set(MAPI.MESSAGE_CLASS,
msg.getMessageClassEnum().name());
} catch (ChunkNotFoundException e) {
//swallow
}
@@ -170,15 +205,10 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
// Start with the metadata
- String subject = msg.getSubject();
Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
- String from = msg.getDisplayFrom();
handleFromTo(headers, parentMetadata);
-
- parentMetadata.set(TikaCoreProperties.TITLE, subject);
- parentMetadata.set(TikaCoreProperties.SUBJECT,
msg.getConversationTopic());
- parentMetadata.set(TikaCoreProperties.DESCRIPTION,
msg.getConversationTopic());
+ handleMessageInfo(msg, headers, parentMetadata);
try {
for (String recipientAddress :
msg.getRecipientEmailAddressList()) {
@@ -197,35 +227,7 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
}
- // Date - try two ways to find it
- // First try via the proper chunk
- if (msg.getMessageDate() != null) {
- parentMetadata.set(TikaCoreProperties.CREATED,
msg.getMessageDate().getTime());
- parentMetadata.set(TikaCoreProperties.MODIFIED,
msg.getMessageDate().getTime());
- } else {
- if (headers != null && headers.size() > 0) {
- for (Map.Entry<String, String[]> header :
headers.entrySet()) {
- String headerKey = header.getKey();
- if
(headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
- String date =
headerKey.substring(headerKey.indexOf(':') + 1).trim();
-
- // See if we can parse it as a normal mail date
- try {
- Date d = MailDateParser.parseDateLenient(date);
- parentMetadata.set(TikaCoreProperties.CREATED,
d);
-
parentMetadata.set(TikaCoreProperties.MODIFIED, d);
- } catch (SecurityException e ) {
- throw e;
- } catch (Exception e) {
- // Store it as-is, and hope for the best...
- parentMetadata.set(TikaCoreProperties.CREATED,
date);
-
parentMetadata.set(TikaCoreProperties.MODIFIED, date);
- }
- break;
- }
- }
- }
- }
+ handleGeneralDates(msg, headers, parentMetadata);
// Get the message body. Preference order is: html, rtf, text
Chunk htmlChunk = null;
@@ -277,6 +279,104 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
}
+ private void handleMessageInfo(MAPIMessage msg, Map<String, String[]>
headers, Metadata metadata) //NOTE(review): headers is not read anywhere in this method
+ throws ChunkNotFoundException {
+ //TITLE is the literal subject of this message, including any "re: " prefix
+ metadata.set(TikaCoreProperties.TITLE, msg.getSubject());
+ //the conversation topic is the thread's original subject without the "re: "; it is stored under SUBJECT, DESCRIPTION and MAPI.CONVERSATION_TOPIC
+ String topic = msg.getConversationTopic();
+ metadata.set(TikaCoreProperties.SUBJECT, topic);
+ metadata.set(TikaCoreProperties.DESCRIPTION, topic);
+ metadata.set(MAPI.CONVERSATION_TOPIC, topic);
+ Chunks mainChunks = msg.getMainChunks();
+ if (mainChunks != null) { //everything below is skipped when the message has no main chunks
+ if (mainChunks.getMessageId() != null) {
+ metadata.set(MAPI.INTERNET_MESSAGE_ID, mainChunks
+ .getMessageId()
+ .getValue());
+ }
+
+ List<Chunk> conversationIndex =
mainChunks.getAll().get(MAPIProperty.CONVERSATION_INDEX);
+ if (conversationIndex != null && ! conversationIndex.isEmpty()) {
+ Chunk chunk = conversationIndex.get(0); //only the first chunk is used
+ if (chunk instanceof ByteChunk) {
+ byte[] bytes = ((ByteChunk)chunk).getValue();
+ String hex = Hex.encodeHexString(bytes);
+ metadata.set(MAPI.CONVERSATION_INDEX, hex); //raw index bytes stored as a hex string
+ }
+ }
+
+ List<Chunk> internetReferences =
mainChunks.getAll().get(MAPIProperty.INTERNET_REFERENCES);
+ if (internetReferences != null) {
+ for (Chunk ref : internetReferences) {
+ if (ref instanceof StringChunk) { //non-string chunks are silently ignored
+ metadata.add(MAPI.INTERNET_REFERENCES, ((StringChunk)
ref).getValue());
+ }
+ }
+ }
+ List<Chunk> inReplyToIds =
mainChunks.getAll().get(MAPIProperty.IN_REPLY_TO_ID);
+ if (inReplyToIds != null && ! inReplyToIds.isEmpty()) {
+ metadata.add(MAPI.IN_REPLY_TO_ID,
inReplyToIds.get(0).toString()); //NOTE(review): add() on a non-bag text property, and Chunk.toString() rather than a typed getValue() - confirm both are intended
+ }
+
+ for (Map.Entry<MAPIProperty, Property> e :
LITERAL_TIME_PROPERTIES.entrySet()) {
+ List<PropertyValue> timeProp =
mainChunks.getProperties().get(e.getKey());
+ if (timeProp != null && ! timeProp.isEmpty()) {
+ Calendar cal =
((PropertyValue.TimePropertyValue)timeProp.get(0)).getValue(); //NOTE(review): unchecked cast; assumes the first value is always a TimePropertyValue
+ metadata.set(e.getValue(), cal);
+ }
+ }
+
+ MessageSubmissionChunk messageSubmissionChunk =
mainChunks.getSubmissionChunk();
+ if (messageSubmissionChunk != null) {
+ String submissionId = messageSubmissionChunk.getSubmissionId();
+ metadata.set(MAPI.SUBMISSION_ID, submissionId);
+ metadata.set(MAPI.SUBMISSION_ACCEPTED_AT_TIME,
messageSubmissionChunk.getAcceptedAtTime());
+ }
+
+ }
+ }
+
+
+ private void handleGeneralDates(MAPIMessage msg, Map<String, String[]>
headers, Metadata metadata) throws ChunkNotFoundException {
+ // CREATED/MODIFIED date - try two ways to find it
+ // First try the message date chunk; otherwise fall back to a "date:" header
+ if (msg.getMessageDate() != null) {
+ metadata.set(TikaCoreProperties.CREATED,
msg.getMessageDate().getTime());
+ metadata.set(TikaCoreProperties.MODIFIED,
msg.getMessageDate().getTime());
+ } else {
+ if (headers != null && headers.size() > 0) {
+ for (Map.Entry<String, String[]> header : headers.entrySet()) {
+ String headerKey = header.getKey();
+ if
(headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
+ String date =
headerKey.substring(headerKey.indexOf(':') + 1).trim(); //NOTE(review): the date text is taken from the map key, not its values - confirm against normalizeHeaders()
+
+ // See if we can parse it as a normal mail date
+ try {
+ Date d = MailDateParser.parseDateLenient(date);
+ metadata.set(TikaCoreProperties.CREATED, d);
+ metadata.set(TikaCoreProperties.MODIFIED, d);
+ } catch (SecurityException e ) {
+ throw e;
+ } catch (Exception e) {
+ // Not parseable: store the raw text as-is, and hope for the best...
+ metadata.set(TikaCoreProperties.CREATED, date);
+ metadata.set(TikaCoreProperties.MODIFIED, date);
+ }
+ break; //only the first matching header is used
+ }
+ }
+ }
+ }
+ //try to overwrite the modified property if the actual
LAST_MODIFICATION_TIME property exists.
+ List<PropertyValue> timeProp =
msg.getMainChunks().getProperties().get(MAPIProperty.LAST_MODIFICATION_TIME); //NOTE(review): getMainChunks() is not null-checked here, unlike in handleMessageInfo
+ if (timeProp != null && ! timeProp.isEmpty()) {
+ Calendar cal =
((PropertyValue.TimePropertyValue)timeProp.get(0)).getValue(); //NOTE(review): unchecked cast; assumes the first value is always a TimePropertyValue
+ metadata.set(TikaCoreProperties.MODIFIED, cal);
+ }
+
+ }
+
private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
XHTMLContentHandler xhtml)
throws SAXException, IOException, TikaException {
@@ -395,7 +495,7 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
Chunks chunks = msg.getMainChunks();
StringChunk sentByServerType = chunks.getSentByServerType();
if (sentByServerType != null) {
- metadata.set(Office.MAPI_SENT_BY_SERVER_TYPE,
sentByServerType.getValue());
+ metadata.set(MAPI.SENT_BY_SERVER_TYPE,
sentByServerType.getValue());
}
Map<MAPIProperty, List<Chunk>> mainChunks =
msg.getMainChunks().getAll();
@@ -411,12 +511,12 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME),
Message.MESSAGE_FROM_NAME,
metadata);
setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME),
- Office.MAPI_FROM_REPRESENTING_NAME, metadata);
+ MAPI.FROM_REPRESENTING_NAME, metadata);
setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS),
Message.MESSAGE_FROM_EMAIL,
metadata);
setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS),
- Office.MAPI_FROM_REPRESENTING_EMAIL, metadata);
+ MAPI.FROM_REPRESENTING_EMAIL, metadata);
for (Recipient recipient : buildRecipients()) {
switch (recipient.recipientType) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index a87c6cb84..f1c9f9e66 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -36,9 +36,9 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.PST;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -139,31 +139,31 @@ public class PSTMailItemParser implements Parser {
metadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
metadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
metadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
- metadata.set(Office.MAPI_MESSAGE_CLIENT_SUBMIT_TIME,
pstMail.getClientSubmitTime());
+ metadata.set(MAPI.SUBMISSION_ACCEPTED_AT_TIME,
pstMail.getClientSubmitTime());
metadata.set(TikaCoreProperties.MODIFIED,
pstMail.getLastModificationTime());
metadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
metadata.set(PST.DESCRIPTOR_NODE_ID,
valueOf(pstMail.getDescriptorNodeId()));
metadata.set(Message.MESSAGE_FROM_EMAIL,
pstMail.getSenderEmailAddress());
if (! StringUtils.isBlank(pstMail.getRecipientsString()) &&
! pstMail.getRecipientsString().equals("No recipients
table!")) {
- metadata.set(Office.MAPI_RECIPIENTS_STRING,
pstMail.getRecipientsString());
+ metadata.set(MAPI.RECIPIENTS_STRING,
pstMail.getRecipientsString());
}
metadata.set(Message.MESSAGE_TO_DISPLAY_NAME, pstMail.getDisplayTo());
metadata.set(Message.MESSAGE_CC_DISPLAY_NAME, pstMail.getDisplayCC());
metadata.set(Message.MESSAGE_BCC_DISPLAY_NAME,
pstMail.getDisplayBCC());
- metadata.set(Office.MAPI_IMPORTANCE, pstMail.getImportance());
- metadata.set(Office.MAPI_PRIORTY, pstMail.getPriority());
- metadata.set(Office.MAPI_IS_FLAGGED, pstMail.isFlagged());
- metadata.set(Office.MAPI_MESSAGE_CLASS,
+ metadata.set(MAPI.IMPORTANCE, pstMail.getImportance());
+ metadata.set(MAPI.PRIORTY, pstMail.getPriority());
+ metadata.set(MAPI.IS_FLAGGED, pstMail.isFlagged());
+ metadata.set(MAPI.MESSAGE_CLASS,
OutlookExtractor.getMessageClass(pstMail.getMessageClass()));
metadata.set(Message.MESSAGE_FROM_EMAIL,
pstMail.getSenderEmailAddress());
- metadata.set(Office.MAPI_FROM_REPRESENTING_EMAIL,
+ metadata.set(MAPI.FROM_REPRESENTING_EMAIL,
pstMail.getSentRepresentingEmailAddress());
metadata.set(Message.MESSAGE_FROM_NAME, pstMail.getSenderName());
- metadata.set(Office.MAPI_FROM_REPRESENTING_NAME,
pstMail.getSentRepresentingName());
+ metadata.set(MAPI.FROM_REPRESENTING_NAME,
pstMail.getSentRepresentingName());
//add recipient details
try {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index f10f4aa7c..a01bd5a8d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -36,9 +36,9 @@ import org.xml.sax.ContentHandler;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@@ -119,12 +119,12 @@ public class OutlookParserTest extends TikaTest {
" by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29
Jan 2009 11:17:08 " +
"-0800",
Arrays.asList(metadata.getValues("Message:Raw-Header:Received")));
- assertEquals("EX", metadata.get(Office.MAPI_SENT_BY_SERVER_TYPE));
- assertEquals("NOTE", metadata.get(Office.MAPI_MESSAGE_CLASS));
+ assertEquals("EX", metadata.get(MAPI.SENT_BY_SERVER_TYPE));
+ assertEquals("NOTE", metadata.get(MAPI.MESSAGE_CLASS));
assertEquals("Jukka Zitting", metadata.get(Message.MESSAGE_FROM_NAME));
assertEquals("[email protected]",
metadata.get(Message.MESSAGE_FROM_EMAIL));
- assertEquals("Jukka Zitting",
metadata.get(Office.MAPI_FROM_REPRESENTING_NAME));
- assertEquals("[email protected]",
metadata.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
+ assertEquals("Jukka Zitting",
metadata.get(MAPI.FROM_REPRESENTING_NAME));
+ assertEquals("[email protected]",
metadata.get(MAPI.FROM_REPRESENTING_EMAIL));
//to-name is empty, make sure that we get an empty string.
assertEquals("[email protected]",
metadata.get(Message.MESSAGE_TO_EMAIL));
@@ -199,9 +199,19 @@ public class OutlookParserTest extends TikaTest {
assertEquals("[email protected]",
metadata.get(Message.MESSAGE_TO_EMAIL));
- assertEquals("Tests Chang@FT (張毓倫)",
metadata.get(Office.MAPI_FROM_REPRESENTING_NAME));
+ assertEquals("Tests Chang@FT (張毓倫)",
metadata.get(MAPI.FROM_REPRESENTING_NAME));
assertEquals("/O=FT GROUP/OU=FT/CN=RECIPIENTS/CN=LYDIACHANG",
- metadata.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
+ metadata.get(MAPI.FROM_REPRESENTING_EMAIL));
+
+ assertEquals("c=TW;a= ;p=FT GROUP;l=FTM02-110329085248Z-89735\u0000",
+ metadata.get(MAPI.SUBMISSION_ID));
+
assertEquals("<[email protected]>",
+ metadata.get(MAPI.INTERNET_MESSAGE_ID));
+
assertTrue(metadata.get(MAPI.SUBMISSION_ACCEPTED_AT_TIME).startsWith("2011-03-29"));
+
assertTrue(metadata.get("mapi:client-submit-time").startsWith("2011-03-29"));
+
assertTrue(metadata.get("mapi:message-delivery-time").startsWith("2011-03-29"));
+
assertTrue(metadata.get("mapi:last-modification-time").startsWith("2011-03-29"));
+
assertTrue(metadata.get("mapi:creation-time").startsWith("2011-03-29"));
}
@Test
@@ -224,6 +234,11 @@ public class OutlookParserTest extends TikaTest {
String content = sw.toString();
assertEquals(2, content.split("<body>").length);
assertEquals(2, content.split("<\\/body>").length);
+ assertEquals("01ccb5408a75b6cf3ad7837949b698499034202313ef000002a160",
metadata.get(MAPI.CONVERSATION_INDEX));
+
assertEquals("<c8508767c15dbf40a21693142739ea8d564d18f...@exvmbx018-1.exch018.msoutlookonline.net>",
+ metadata.get(MAPI.INTERNET_REFERENCES));
+
assertEquals("<c8508767c15dbf40a21693142739ea8d564d18f...@exvmbx018-1.exch018.msoutlookonline.net>",
+ metadata.get(MAPI.IN_REPLY_TO_ID));
}
@Test
@@ -289,8 +304,8 @@ public class OutlookParserTest extends TikaTest {
private void testMsgClass(String expected, Metadata metadata) {
assertTrue(expected.equalsIgnoreCase(
-
metadata.get(Office.MAPI_MESSAGE_CLASS).replaceAll("_", "")),
- expected + ", but got: " +
metadata.get(Office.MAPI_MESSAGE_CLASS));
+
metadata.get(MAPI.MESSAGE_CLASS).replaceAll("_", "")),
+ expected + ", but got: " + metadata.get(MAPI.MESSAGE_CLASS));
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
index 8e6863596..73e623393 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
@@ -26,9 +26,9 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.PST;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
@@ -78,7 +78,7 @@ public class TestLibPstParser extends TikaTest {
assertEquals("NOTE", metadataList
.get(7)
- .get(Office.MAPI_MESSAGE_CLASS));
+ .get(MAPI.MESSAGE_CLASS));
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index 6e9a6d6d1..e73c6c9fe 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -26,9 +26,9 @@ import java.util.List;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.PST;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -74,9 +74,9 @@ public class OutlookPSTParserTest extends TikaTest {
assertEquals("", m1.get(Message.MESSAGE_CC_DISPLAY_NAME));
assertEquals("", m1.get(Message.MESSAGE_BCC_DISPLAY_NAME));
assertEquals("[email protected]", m1.get(Message.MESSAGE_FROM_EMAIL));
- assertEquals("Jörn Kottmann",
m1.get(Office.MAPI_FROM_REPRESENTING_NAME));
- assertEquals("[email protected]",
m1.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
- assertEquals("NOTE", m1.get(Office.MAPI_MESSAGE_CLASS));
+ assertEquals("Jörn Kottmann", m1.get(MAPI.FROM_REPRESENTING_NAME));
+ assertEquals("[email protected]",
m1.get(MAPI.FROM_REPRESENTING_EMAIL));
+ assertEquals("NOTE", m1.get(MAPI.MESSAGE_CLASS));
assertEquals("/Début du fichier de données Outlook",
m1.get(PST.PST_FOLDER_PATH));
//test that subject is making it into the xhtml
assertContains("<meta name=\"dc:subject\" content=\"Re: Feature
Generators\"", m1.get(TikaCoreProperties.TIKA_CONTENT));
@@ -84,11 +84,11 @@ public class OutlookPSTParserTest extends TikaTest {
Metadata m6 = metadataList.get(6);
assertEquals("Couchbase", m6.get(Message.MESSAGE_FROM_NAME));
assertEquals("[email protected]",
m6.get(Message.MESSAGE_FROM_EMAIL));
- assertEquals("Couchbase", m6.get(Office.MAPI_FROM_REPRESENTING_NAME));
- assertEquals("[email protected]",
m6.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
- assertEquals("NOTE", m1.get(Office.MAPI_MESSAGE_CLASS));
- assertNull(m1.get(Office.MAPI_RECIPIENTS_STRING));
- assertContains("2014-02-26",
m1.get(Office.MAPI_MESSAGE_CLIENT_SUBMIT_TIME));
+ assertEquals("Couchbase", m6.get(MAPI.FROM_REPRESENTING_NAME));
+ assertEquals("[email protected]",
m6.get(MAPI.FROM_REPRESENTING_EMAIL));
+ assertEquals("NOTE", m1.get(MAPI.MESSAGE_CLASS));
+ assertNull(m1.get(MAPI.RECIPIENTS_STRING));
+ assertContains("2014-02-26", m1.get(MAPI.SUBMISSION_ACCEPTED_AT_TIME));
//test full EX email
assertEquals(