This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e2b41ec91 TIKA-4362 -- expand coverage of message classes for MAPI
(#2076)
e2b41ec91 is described below
commit e2b41ec9180584c8ba94ec68d9ca30987efdfa18
Author: Tim Allison <[email protected]>
AuthorDate: Thu Dec 5 12:09:14 2024 -0500
TIKA-4362 -- expand coverage of message classes for MAPI (#2076)
* TIKA-4362 -- expand coverage of message classes for MAPI
---
.../main/java/org/apache/tika/metadata/MAPI.java | 20 +-
.../tika/parser/microsoft/OutlookExtractor.java | 297 +++++++++++----------
.../parser/microsoft/pst/PSTMailItemParser.java | 4 +-
.../main/resources/mapi_message_classes.properties | 37 +++
4 files changed, 212 insertions(+), 146 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
index 2cf41c7e0..57b46307f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
@@ -17,13 +17,10 @@
package org.apache.tika.metadata;
/**
- * Office Document properties collection. These properties apply to
- * Office / Productivity Documents of all forms, including (but not limited
- * to) MS Office and OpenDocument formats.
- * This is a logical collection of properties, which may be drawn from a
- * few different external definitions.
*
- * @since Apache Tika 1.2
+ * Properties that typically appear in MSG/PST message format files.
+ *
+ * @since Apache Tika 4.0
*/
public interface MAPI {
@@ -31,10 +28,17 @@ public interface MAPI {
/**
* MAPI message class. What type of .msg/MAPI file is it?
+ * This is normalized via "mapi_message_classes.properties".
*/
Property MESSAGE_CLASS =
- Property.internalClosedChoise(PREFIX_MAPI_META + "message-class",
"APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE",
- "POST", "TASK", "UNKNOWN", "UNSPECIFIED");
+ Property.internalText(PREFIX_MAPI_META + "message-class");
+
+ /**
+ * MAPI message class. What type of .msg/MAPI file is it?
+ * This is the raw value that is retrieved from the underlying chunk
+ */
+ Property MESSAGE_CLASS_RAW =
+ Property.internalText(PREFIX_MAPI_META + "message-class-raw");
Property SENT_BY_SERVER_TYPE = Property.internalText(PREFIX_MAPI_META +
"sent-by-server-type");
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 8f381e923..b9c14c115 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -18,7 +18,9 @@ package org.apache.tika.parser.microsoft;
import static java.nio.charset.StandardCharsets.UTF_8;
+import java.io.BufferedReader;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
@@ -106,6 +108,8 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
private static final Map<MAPIProperty, Property> LITERAL_TIME_PROPERTIES =
new HashMap<>();
+ private static final Map<String, String> MESSAGE_CLASSES = new
LinkedHashMap<>();
+
static {
for (MAPIProperty property : LITERAL_TIME_MAPI_PROPERTIES) {
String name = property.mapiProperty.toLowerCase(Locale.ROOT);
@@ -115,7 +119,30 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
Property tikaProp = Property.internalDate(name);
LITERAL_TIME_PROPERTIES.put(property, tikaProp);
}
+
+ try (BufferedReader r = new BufferedReader(
+ new InputStreamReader(
+
OutlookExtractor.class.getResourceAsStream("/mapi_message_classes.properties"),
UTF_8))) {
+ String line = r.readLine();
+ while (line != null) {
+ if (line.isBlank() || line.startsWith("#")) {
+ line = r.readLine();
+ continue;
+ }
+ String[] cols = line.split("\\s+");
+ String lcKey = cols[0].toLowerCase(Locale.ROOT);
+ String value = cols[1];
+ if (MESSAGE_CLASSES.containsKey(lcKey)) {
+ throw new IllegalArgumentException("Can't have duplicate
keys: " + lcKey);
+ }
+ MESSAGE_CLASSES.put(lcKey, value);
+ line = r.readLine();
+ }
+ } catch (IOException e) {
+ throw new IllegalStateException("can't find
mapi_message_classes.properties?!");
+ }
}
+
//this according to the spec; in practice, it is probably more likely
//that a "split field" fails to start with a space character than
//that a real header contains anything but [-_A-Za-z0-9].
@@ -153,134 +180,115 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
private static void setFirstChunk(List<Chunk> chunks, Property property,
Metadata metadata) {
- if (chunks == null || chunks.size() < 1 || chunks.get(0) == null) {
+ if (chunks == null || chunks.isEmpty() || chunks.get(0) == null) {
return;
}
metadata.set(property, chunks.get(0).toString());
}
- private static void addFirstChunk(List<Chunk> chunks, Property property,
Metadata metadata) {
- if (chunks == null || chunks.size() < 1 || chunks.get(0) == null) {
- return;
- }
- metadata.add(property, chunks.get(0).toString());
- }
-
- //Still needed by PSTParser
- public static String getMessageClass(String messageClass) {
- if (messageClass == null || messageClass.trim().length() == 0) {
+ public static String getNormalizedMessageClass(String messageClass) {
+ if (messageClass == null || messageClass.isBlank()) {
return "UNSPECIFIED";
- } else if (messageClass.equalsIgnoreCase("IPM.Note")) {
- return "NOTE";
- } else if (messageClass.equalsIgnoreCase("IPM.Contact")) {
- return "CONTACT";
- } else if (messageClass.equalsIgnoreCase("IPM.Appointment")) {
- return "APPOINTMENT";
- } else if (messageClass.equalsIgnoreCase("IPM.StickyNote")) {
- return "STICKY_NOTE";
- } else if (messageClass.equalsIgnoreCase("IPM.Task")) {
- return "TASK";
- } else if (messageClass.equalsIgnoreCase("IPM.Post")) {
- return "POST";
- } else {
- return "UNKNOWN";
}
+ String lc = messageClass.toLowerCase(Locale.ROOT);
+ if (MESSAGE_CLASSES.containsKey(lc)) {
+ return MESSAGE_CLASSES.get(lc);
+ }
+ return "UNKNOWN";
}
public void parse(XHTMLContentHandler xhtml)
throws TikaException, SAXException, IOException {
try {
- msg.setReturnNullOnMissingChunk(true);
+ _parse(xhtml);
+ } catch (ChunkNotFoundException e) {
+ throw new TikaException("POI MAPIMessage broken - didn't return
null on missing chunk",
+ e);
+ } /*finally {
+ //You'd think you'd want to call msg.close().
+ //Don't do that. That closes down the file system.
+ //If an msg has multiple msg attachments, some of them
+ //can reside in the same file system. After the first
+ //child is read, the fs is closed, and the other children
+ //get a java.nio.channels.ClosedChannelException
+ }*/
+ }
- try {
- parentMetadata.set(MAPI.MESSAGE_CLASS,
msg.getMessageClassEnum().name());
- } catch (ChunkNotFoundException e) {
- //swallow
- }
+ private void _parse(XHTMLContentHandler xhtml) throws TikaException,
SAXException,
+ IOException, ChunkNotFoundException {
+ msg.setReturnNullOnMissingChunk(true);
- // If the message contains strings that aren't stored
- // as Unicode, try to sort out an encoding for them
- if (msg.has7BitEncodingStrings()) {
- guess7BitEncoding(msg);
- }
+ // If the message contains strings that aren't stored
+ // as Unicode, try to sort out an encoding for them
+ if (msg.has7BitEncodingStrings()) {
+ guess7BitEncoding(msg);
+ }
- // Start with the metadata
- Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
+ // Start with the metadata
+ Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
- handleFromTo(headers, parentMetadata);
- handleMessageInfo(msg, headers, parentMetadata);
+ handleFromTo(headers, parentMetadata);
+ handleMessageInfo(msg, headers, parentMetadata);
- try {
- for (String recipientAddress :
msg.getRecipientEmailAddressList()) {
- if (recipientAddress != null) {
- parentMetadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS,
recipientAddress);
- }
+ try {
+ for (String recipientAddress : msg.getRecipientEmailAddressList())
{
+ if (recipientAddress != null) {
+ parentMetadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS,
recipientAddress);
}
- } catch (ChunkNotFoundException he) {
- // Will be fixed in POI 3.7 Final
}
+ } catch (ChunkNotFoundException e) {
+ //you'd think we wouldn't need this. we do.
+ }
- for (Map.Entry<String, String[]> e : headers.entrySet()) {
- String headerKey = e.getKey();
- for (String headerValue : e.getValue()) {
- parentMetadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX +
headerKey, headerValue);
- }
+ for (Map.Entry<String, String[]> e : headers.entrySet()) {
+ String headerKey = e.getKey();
+ for (String headerValue : e.getValue()) {
+ parentMetadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX +
headerKey, headerValue);
}
+ }
- handleGeneralDates(msg, headers, parentMetadata);
+ handleGeneralDates(msg, headers, parentMetadata);
- // Get the message body. Preference order is: html, rtf, text
- Chunk htmlChunk = null;
- Chunk rtfChunk = null;
- Chunk textChunk = null;
- for (Chunk chunk : msg.getMainChunks().getChunks()) {
- if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
- htmlChunk = chunk;
- }
- if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
- rtfChunk = chunk;
- }
- if (chunk.getChunkId() == MAPIProperty.BODY.id) {
- textChunk = chunk;
- }
+ // Get the message body. Preference order is: html, rtf, text
+ Chunk htmlChunk = null;
+ Chunk rtfChunk = null;
+ Chunk textChunk = null;
+ for (Chunk chunk : msg.getMainChunks().getChunks()) {
+ if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
+ htmlChunk = chunk;
}
- handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
+ if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
+ rtfChunk = chunk;
+ }
+ if (chunk.getChunkId() == MAPIProperty.BODY.id) {
+ textChunk = chunk;
+ }
+ }
+ handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
- // Process the attachments
- for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
+ // Process the attachments
+ for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
- String filename = null;
- if (attachment.getAttachLongFileName() != null) {
- filename = attachment.getAttachLongFileName().getValue();
- } else if (attachment.getAttachFileName() != null) {
- filename = attachment.getAttachFileName().getValue();
- }
+ String filename = null;
+ if (attachment.getAttachLongFileName() != null) {
+ filename = attachment.getAttachLongFileName().getValue();
+ } else if (attachment.getAttachFileName() != null) {
+ filename = attachment.getAttachFileName().getValue();
+ }
- if (attachment.getAttachData() != null) {
- handleEmbeddedResource(
-
TikaInputStream.get(attachment.getAttachData().getValue()), filename,
- null, null, xhtml, true);
- }
- if (attachment.getAttachmentDirectory() != null) {
-
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(),
filename,
- xhtml, true);
- }
+ if (attachment.getAttachData() != null) {
+ handleEmbeddedResource(
+
TikaInputStream.get(attachment.getAttachData().getValue()), filename,
+ null, null, xhtml, true);
+ }
+ if (attachment.getAttachmentDirectory() != null) {
+
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(),
filename,
+ xhtml, true);
}
- } catch (ChunkNotFoundException e) {
- throw new TikaException("POI MAPIMessage broken - didn't return
null on missing chunk",
- e);
- } finally {
- //You'd think you'd want to call msg.close().
- //Don't do that. That closes down the file system.
- //If an msg has multiple msg attachments, some of them
- //can reside in the same file system. After the first
- //child is read, the fs is closed, and the other children
- //get a java.nio.channels.ClosedChannelException
}
- }
- private void handleMessageInfo(MAPIMessage msg, Map<String, String[]>
headers, Metadata metadata)
- throws ChunkNotFoundException {
+ }
+ private void handleMessageInfo(MAPIMessage msg, Map<String, String[]>
headers, Metadata metadata) throws ChunkNotFoundException {
//this is the literal subject including "re: "
metadata.set(TikaCoreProperties.TITLE, msg.getSubject());
//this is the original topic for the thread without the "re: "
@@ -289,51 +297,66 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
metadata.set(TikaCoreProperties.DESCRIPTION, topic);
metadata.set(MAPI.CONVERSATION_TOPIC, topic);
Chunks mainChunks = msg.getMainChunks();
- if (mainChunks != null) {
- if (mainChunks.getMessageId() != null) {
- metadata.set(MAPI.INTERNET_MESSAGE_ID, mainChunks
- .getMessageId()
- .getValue());
- }
-
- List<Chunk> conversationIndex =
mainChunks.getAll().get(MAPIProperty.CONVERSATION_INDEX);
- if (conversationIndex != null && ! conversationIndex.isEmpty()) {
- Chunk chunk = conversationIndex.get(0);
- if (chunk instanceof ByteChunk) {
- byte[] bytes = ((ByteChunk)chunk).getValue();
- String hex = Hex.encodeHexString(bytes);
- metadata.set(MAPI.CONVERSATION_INDEX, hex);
- }
- }
-
- List<Chunk> internetReferences =
mainChunks.getAll().get(MAPIProperty.INTERNET_REFERENCES);
- if (internetReferences != null) {
- for (Chunk ref : internetReferences) {
- if (ref instanceof StringChunk) {
- metadata.add(MAPI.INTERNET_REFERENCES, ((StringChunk)
ref).getValue());
- }
- }
- }
- List<Chunk> inReplyToIds =
mainChunks.getAll().get(MAPIProperty.IN_REPLY_TO_ID);
- if (inReplyToIds != null && ! inReplyToIds.isEmpty()) {
- metadata.add(MAPI.IN_REPLY_TO_ID,
inReplyToIds.get(0).toString());
+ if (mainChunks == null) {
+ return;
+ }
+ if (mainChunks.getMessageId() != null) {
+ metadata.set(MAPI.INTERNET_MESSAGE_ID, mainChunks
+ .getMessageId()
+ .getValue());
+ }
+
+ String mc = msg.getStringFromChunk(mainChunks.getMessageClass());
+ if (mc != null) {
+ metadata.set(MAPI.MESSAGE_CLASS_RAW, mc);
+ }
+ metadata.set(MAPI.MESSAGE_CLASS, getNormalizedMessageClass(mc));
+ List<Chunk> conversationIndex = mainChunks
+ .getAll()
+ .get(MAPIProperty.CONVERSATION_INDEX);
+ if (conversationIndex != null && !conversationIndex.isEmpty()) {
+ Chunk chunk = conversationIndex.get(0);
+ if (chunk instanceof ByteChunk) {
+ byte[] bytes = ((ByteChunk) chunk).getValue();
+ String hex = Hex.encodeHexString(bytes);
+ metadata.set(MAPI.CONVERSATION_INDEX, hex);
}
+ }
- for (Map.Entry<MAPIProperty, Property> e :
LITERAL_TIME_PROPERTIES.entrySet()) {
- List<PropertyValue> timeProp =
mainChunks.getProperties().get(e.getKey());
- if (timeProp != null && ! timeProp.isEmpty()) {
- Calendar cal =
((PropertyValue.TimePropertyValue)timeProp.get(0)).getValue();
- metadata.set(e.getValue(), cal);
+ List<Chunk> internetReferences = mainChunks
+ .getAll()
+ .get(MAPIProperty.INTERNET_REFERENCES);
+ if (internetReferences != null) {
+ for (Chunk ref : internetReferences) {
+ if (ref instanceof StringChunk) {
+ metadata.add(MAPI.INTERNET_REFERENCES, ((StringChunk)
ref).getValue());
}
}
-
- MessageSubmissionChunk messageSubmissionChunk =
mainChunks.getSubmissionChunk();
- if (messageSubmissionChunk != null) {
- String submissionId = messageSubmissionChunk.getSubmissionId();
- metadata.set(MAPI.SUBMISSION_ID, submissionId);
- metadata.set(MAPI.SUBMISSION_ACCEPTED_AT_TIME,
messageSubmissionChunk.getAcceptedAtTime());
+ }
+ List<Chunk> inReplyToIds = mainChunks
+ .getAll()
+ .get(MAPIProperty.IN_REPLY_TO_ID);
+ if (inReplyToIds != null && !inReplyToIds.isEmpty()) {
+ metadata.add(MAPI.IN_REPLY_TO_ID, inReplyToIds
+ .get(0)
+ .toString());
+ }
+
+ for (Map.Entry<MAPIProperty, Property> e :
LITERAL_TIME_PROPERTIES.entrySet()) {
+ List<PropertyValue> timeProp = mainChunks
+ .getProperties()
+ .get(e.getKey());
+ if (timeProp != null && !timeProp.isEmpty()) {
+ Calendar cal = ((PropertyValue.TimePropertyValue)
timeProp.get(0)).getValue();
+ metadata.set(e.getValue(), cal);
}
+ }
+ MessageSubmissionChunk messageSubmissionChunk =
mainChunks.getSubmissionChunk();
+ if (messageSubmissionChunk != null) {
+ String submissionId = messageSubmissionChunk.getSubmissionId();
+ metadata.set(MAPI.SUBMISSION_ID, submissionId);
+ metadata.set(MAPI.SUBMISSION_ACCEPTED_AT_TIME,
messageSubmissionChunk.getAcceptedAtTime());
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index f1c9f9e66..4b21e5141 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -155,7 +155,9 @@ public class PSTMailItemParser implements Parser {
metadata.set(MAPI.PRIORTY, pstMail.getPriority());
metadata.set(MAPI.IS_FLAGGED, pstMail.isFlagged());
metadata.set(MAPI.MESSAGE_CLASS,
- OutlookExtractor.getMessageClass(pstMail.getMessageClass()));
+
OutlookExtractor.getNormalizedMessageClass(pstMail.getMessageClass()));
+ metadata.set(MAPI.MESSAGE_CLASS_RAW, pstMail.getMessageClass());
+
metadata.set(Message.MESSAGE_FROM_EMAIL,
pstMail.getSenderEmailAddress());
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/mapi_message_classes.properties
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/mapi_message_classes.properties
new file mode 100644
index 000000000..f48fb5afb
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/mapi_message_classes.properties
@@ -0,0 +1,37 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+IPM.Note NOTE
+IPM.Contact CONTACT
+IPM.Appointment APPOINTMENT
+IPM.StickyNote STICKY_NOTE
+IPM.Task TASK
+IPM.Post POST
+IPM.Schedule.Meeting.Request MEETING_REQUEST
+IPM.Schedule.Meeting.Canceled MEETING_CANCELED
+IPM.Schedule.Meeting.Resp.Pos MEETING_RESPONSE_POSITIVE
+IPM.Schedule.Meeting.Resp.Neg MEETING_RESPONSE_NEGATIVE
+IPM.Schedule.Meeting.Resp.Tent MEETING_RESPONSE_TENTATIVE
+IPM.Schedule.Meeting.Notification.Forward MEETING_NOTIFICATION_FORWARD
+IPM.Schedule.Inquiry SCHEDULE_INQUIRY
+IPM.Configuration.MRM CONFIGURATION_MRM
+REPORT.IPM.Note.DR NOTE_DELIVERED
+REPORT.IPM.Note.NDR NOTE_NOT_DELIVERED
+REPORT.IPM.Note.IPNRN IPNRN READ_RECEIPT
+REPORT.IPM.Note.IPNNRN IPNNRN NOT_READ_RECEIPT
+RPM.Note.Rules.OofTemplate.Microsoft OUT_OF_OFFICE_TEMPLATE
+IPM.Microsoft.FolderDesign.NamedView FOLDER_DESIGN_NAMED_VIEW
+# see
https://learn.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxocal/e920fdbf-b561-4dc2-bee7-0c4fd36bd2ac
+IPM.OLE.CLASS.{00061055-0000-0000-C000-000000000046}
RECURRING_EVENT_MEETING_EXCEPTION