This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4360 in repository https://gitbox.apache.org/repos/asf/tika.git
commit e293bb2d36198d8741825fd072d931ddd84bb998 Author: tallison <[email protected]> AuthorDate: Mon Dec 2 15:35:25 2024 -0500 TIKA-4360 -- improve extraction of mapi metadata --- .../main/java/org/apache/tika/metadata/MAPI.java | 63 +++++++ .../main/java/org/apache/tika/metadata/Office.java | 32 ---- .../tika/parser/microsoft/OutlookExtractor.java | 190 ++++++++++++++++----- .../parser/microsoft/pst/PSTMailItemParser.java | 18 +- .../tika/parser/microsoft/OutlookParserTest.java | 33 +++- .../parser/microsoft/libpst/TestLibPstParser.java | 4 +- .../parser/microsoft/pst/OutlookPSTParserTest.java | 18 +- 7 files changed, 252 insertions(+), 106 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java new file mode 100644 index 000000000..2cf41c7e0 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata; + +/** + * MAPI properties collection. These properties apply to messages + * extracted from MAPI sources, such as Outlook .msg files and the mail + * items stored in PST files. 
+ * This is a logical collection of properties, which may be drawn from a + * few different external definitions. + * + * The properties formerly named {@code Office.MAPI_*} now live here (TIKA-4360). + */ +public interface MAPI { + + String PREFIX_MAPI_META = "mapi" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + + /** + * MAPI message class. What type of .msg/MAPI file is it? + */ + Property MESSAGE_CLASS = + Property.internalClosedChoise(PREFIX_MAPI_META + "message-class", "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", + "POST", "TASK", "UNKNOWN", "UNSPECIFIED"); + + Property SENT_BY_SERVER_TYPE = Property.internalText(PREFIX_MAPI_META + "sent-by-server-type"); + + Property FROM_REPRESENTING_NAME = Property.internalText(PREFIX_MAPI_META + "from-representing-name"); + + Property FROM_REPRESENTING_EMAIL = Property.internalText(PREFIX_MAPI_META + "from-representing-email"); + + Property SUBMISSION_ACCEPTED_AT_TIME = Property.internalDate(PREFIX_MAPI_META + "msg-submission-accepted-at-time"); + + Property SUBMISSION_ID = Property.internalText(PREFIX_MAPI_META + "msg-submission-id"); + + Property INTERNET_MESSAGE_ID = Property.internalText(PREFIX_MAPI_META + "internet-message-id"); + + Property INTERNET_REFERENCES = Property.internalTextBag(PREFIX_MAPI_META + "internet-references"); + + + Property CONVERSATION_TOPIC = Property.internalText(PREFIX_MAPI_META + "conversation-topic"); + + Property CONVERSATION_INDEX = Property.internalText(PREFIX_MAPI_META + "conversation-index"); + Property IN_REPLY_TO_ID = Property.internalText(PREFIX_MAPI_META + "in-reply-to-id"); + + Property RECIPIENTS_STRING = Property.internalText(PREFIX_MAPI_META + "recipients-string"); + Property IMPORTANCE = Property.internalInteger(PREFIX_MAPI_META + "importance"); + Property PRIORTY = Property.internalInteger(PREFIX_MAPI_META + "priority"); + Property IS_FLAGGED = Property.internalBoolean(PREFIX_MAPI_META + "is-flagged"); +} diff --git 
a/tika-core/src/main/java/org/apache/tika/metadata/Office.java 
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java index 2a9e428eb..aa4b9f002 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java @@ -145,30 +145,6 @@ public interface Office { Property OBJECT_COUNT = Property.internalInteger( PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "object-count"); - /** - * MAPI message class. What type of .msg/MAPI file is it? - */ - Property MAPI_MESSAGE_CLASS = Property.internalClosedChoise( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-message-class", - "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK", "UNKNOWN", - "UNSPECIFIED"); - - Property MAPI_SENT_BY_SERVER_TYPE = Property.internalText( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "mapi-sent-by-server-type"); - - Property MAPI_FROM_REPRESENTING_NAME = Property.internalText( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "mapi-from-representing-name"); - - Property MAPI_FROM_REPRESENTING_EMAIL = Property.internalText( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "mapi-from-representing-email"); - - Property MAPI_MESSAGE_CLIENT_SUBMIT_TIME = Property.internalDate( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "mapi-msg-client-submit-time"); - /** * Embedded files may have a "progID" associated with them, such as * Word.Document.12 or AcroExch.Document.DC @@ -176,12 +152,4 @@ public interface Office { Property PROG_ID = Property.internalText("msoffice:progID"); Property OCX_NAME = Property.internalText("msoffice:ocxName"); - Property MAPI_RECIPIENTS_STRING = Property.internalText(PREFIX_DOC_META + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-recipients-string"); - Property MAPI_IMPORTANCE = Property.internalInteger(PREFIX_DOC_META + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance"); - Property 
MAPI_PRIORTY = Property.internalInteger(PREFIX_DOC_META + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance"); - Property MAPI_IS_FLAGGED = Property.internalBoolean(PREFIX_DOC_META + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-is-flagged"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 5a2dc996e..8f381e923 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -24,8 +24,10 @@ import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.util.ArrayList; +import java.util.Calendar; import java.util.Collections; import java.util.Date; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; @@ -34,6 +36,7 @@ import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.commons.codec.binary.Hex; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.apache.james.mime4j.codec.DecodeMonitor; import org.apache.james.mime4j.codec.DecoderUtil; @@ -44,6 +47,7 @@ import org.apache.poi.hsmf.datatypes.ByteChunk; import org.apache.poi.hsmf.datatypes.Chunk; import org.apache.poi.hsmf.datatypes.Chunks; import org.apache.poi.hsmf.datatypes.MAPIProperty; +import org.apache.poi.hsmf.datatypes.MessageSubmissionChunk; import 
org.apache.poi.hsmf.datatypes.PropertyValue; import org.apache.poi.hsmf.datatypes.RecipientChunks; import org.apache.poi.hsmf.datatypes.StringChunk; @@ -56,9 +60,9 @@ import org.xml.sax.SAXException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -80,17 +84,48 @@ import org.apache.tika.sax.XHTMLContentHandler; public class OutlookExtractor extends AbstractPOIFSExtractor { private static final Metadata EMPTY_METADATA = new Metadata(); - - private static Pattern HEADER_KEY_PAT = - Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z"); - - private final MAPIMessage msg; + private static final MAPIProperty[] LITERAL_TIME_MAPI_PROPERTIES = new MAPIProperty[] { + MAPIProperty.CLIENT_SUBMIT_TIME, + MAPIProperty.CREATION_TIME, + MAPIProperty.DEFERRED_DELIVERY_TIME, + MAPIProperty.DELIVER_TIME, + //EXPAND BEGIN and EXPAND END? 
+ MAPIProperty.EXPIRY_TIME, + MAPIProperty.LAST_MODIFICATION_TIME, + MAPIProperty.LATEST_DELIVERY_TIME, + MAPIProperty.MESSAGE_DELIVERY_TIME, + MAPIProperty.MESSAGE_DOWNLOAD_TIME, + MAPIProperty.ORIGINAL_DELIVERY_TIME, + MAPIProperty.ORIGINAL_SUBMIT_TIME, + MAPIProperty.PROVIDER_SUBMIT_TIME, + MAPIProperty.RECEIPT_TIME, + MAPIProperty.REPLY_TIME, + MAPIProperty.REPORT_TIME + + }; + + private static final Map<MAPIProperty, Property> LITERAL_TIME_PROPERTIES = new HashMap<>(); + + static { + for (MAPIProperty property : LITERAL_TIME_MAPI_PROPERTIES) { + String name = property.mapiProperty.toLowerCase(Locale.ROOT); + name = name.substring(3); + name = name.replace('_', '-'); + name = MAPI.PREFIX_MAPI_META + name; + Property tikaProp = Property.internalDate(name); + LITERAL_TIME_PROPERTIES.put(property, tikaProp); + } + } //this according to the spec; in practice, it is probably more likely //that a "split field" fails to start with a space character than //that a real header contains anything but [-_A-Za-z0-9]. //e.g. //header: this header goes onto the next line //<mailto:[email protected]... 
+ private static Pattern HEADER_KEY_PAT = + Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z"); + + private final MAPIMessage msg; private final ParseContext parseContext; private final boolean extractAllAlternatives; HtmlEncodingDetector detector = new HtmlEncodingDetector(); @@ -158,7 +193,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { msg.setReturnNullOnMissingChunk(true); try { - parentMetadata.set(Office.MAPI_MESSAGE_CLASS, msg.getMessageClassEnum().name()); + parentMetadata.set(MAPI.MESSAGE_CLASS, msg.getMessageClassEnum().name()); } catch (ChunkNotFoundException e) { //swallow } @@ -170,15 +205,10 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } // Start with the metadata - String subject = msg.getSubject(); Map<String, String[]> headers = normalizeHeaders(msg.getHeaders()); - String from = msg.getDisplayFrom(); handleFromTo(headers, parentMetadata); - - parentMetadata.set(TikaCoreProperties.TITLE, subject); - parentMetadata.set(TikaCoreProperties.SUBJECT, msg.getConversationTopic()); - parentMetadata.set(TikaCoreProperties.DESCRIPTION, msg.getConversationTopic()); + handleMessageInfo(msg, headers, parentMetadata); try { for (String recipientAddress : msg.getRecipientEmailAddressList()) { @@ -197,35 +227,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } - // Date - try two ways to find it - // First try via the proper chunk - if (msg.getMessageDate() != null) { - parentMetadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime()); - parentMetadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime()); - } else { - if (headers != null && headers.size() > 0) { - for (Map.Entry<String, String[]> header : headers.entrySet()) { - String headerKey = header.getKey(); - if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) { - String date = headerKey.substring(headerKey.indexOf(':') + 1).trim(); - - // See if we can parse it as a normal mail date - try { - Date d = 
MailDateParser.parseDateLenient(date); - parentMetadata.set(TikaCoreProperties.CREATED, d); - parentMetadata.set(TikaCoreProperties.MODIFIED, d); - } catch (SecurityException e ) { - throw e; - } catch (Exception e) { - // Store it as-is, and hope for the best... - parentMetadata.set(TikaCoreProperties.CREATED, date); - parentMetadata.set(TikaCoreProperties.MODIFIED, date); - } - break; - } - } - } - } + handleGeneralDates(msg, headers, parentMetadata); // Get the message body. Preference order is: html, rtf, text Chunk htmlChunk = null; @@ -277,6 +279,104 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } + private void handleMessageInfo(MAPIMessage msg, Map<String, String[]> headers, Metadata metadata) + throws ChunkNotFoundException { + //this is the literal subject including "re: " + metadata.set(TikaCoreProperties.TITLE, msg.getSubject()); + //this is the original topic for the thread without the "re: " + String topic = msg.getConversationTopic(); + metadata.set(TikaCoreProperties.SUBJECT, topic); + metadata.set(TikaCoreProperties.DESCRIPTION, topic); + metadata.set(MAPI.CONVERSATION_TOPIC, topic); + Chunks mainChunks = msg.getMainChunks(); + if (mainChunks != null) { + if (mainChunks.getMessageId() != null) { + metadata.set(MAPI.INTERNET_MESSAGE_ID, mainChunks + .getMessageId() + .getValue()); + } + + List<Chunk> conversationIndex = mainChunks.getAll().get(MAPIProperty.CONVERSATION_INDEX); + if (conversationIndex != null && ! 
conversationIndex.isEmpty()) { + Chunk chunk = conversationIndex.get(0); + if (chunk instanceof ByteChunk) { + byte[] bytes = ((ByteChunk)chunk).getValue(); + String hex = Hex.encodeHexString(bytes); + metadata.set(MAPI.CONVERSATION_INDEX, hex); + } + } + + List<Chunk> internetReferences = mainChunks.getAll().get(MAPIProperty.INTERNET_REFERENCES); + if (internetReferences != null) { + for (Chunk ref : internetReferences) { + if (ref instanceof StringChunk) { + metadata.add(MAPI.INTERNET_REFERENCES, ((StringChunk) ref).getValue()); + } + } + } + List<Chunk> inReplyToIds = mainChunks.getAll().get(MAPIProperty.IN_REPLY_TO_ID); + if (inReplyToIds != null && ! inReplyToIds.isEmpty()) { + metadata.add(MAPI.IN_REPLY_TO_ID, inReplyToIds.get(0).toString()); + } + + for (Map.Entry<MAPIProperty, Property> e : LITERAL_TIME_PROPERTIES.entrySet()) { + List<PropertyValue> timeProp = mainChunks.getProperties().get(e.getKey()); + if (timeProp != null && ! timeProp.isEmpty()) { + Calendar cal = ((PropertyValue.TimePropertyValue)timeProp.get(0)).getValue(); + metadata.set(e.getValue(), cal); + } + } + + MessageSubmissionChunk messageSubmissionChunk = mainChunks.getSubmissionChunk(); + if (messageSubmissionChunk != null) { + String submissionId = messageSubmissionChunk.getSubmissionId(); + metadata.set(MAPI.SUBMISSION_ID, submissionId); + metadata.set(MAPI.SUBMISSION_ACCEPTED_AT_TIME, messageSubmissionChunk.getAcceptedAtTime()); + } + + } + } + + + private void handleGeneralDates(MAPIMessage msg, Map<String, String[]> headers, Metadata metadata) throws ChunkNotFoundException { + // Date - try two ways to find it + // First try via the proper chunk + if (msg.getMessageDate() != null) { + metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime()); + metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime()); + } else { + if (headers != null && headers.size() > 0) { + for (Map.Entry<String, String[]> header : headers.entrySet()) { + String headerKey = 
header.getKey(); + if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) { + String date = headerKey.substring(headerKey.indexOf(':') + 1).trim(); + + // See if we can parse it as a normal mail date + try { + Date d = MailDateParser.parseDateLenient(date); + metadata.set(TikaCoreProperties.CREATED, d); + metadata.set(TikaCoreProperties.MODIFIED, d); + } catch (SecurityException e ) { + throw e; + } catch (Exception e) { + // Store it as-is, and hope for the best... + metadata.set(TikaCoreProperties.CREATED, date); + metadata.set(TikaCoreProperties.MODIFIED, date); + } + break; + } + } + } + } + //try to overwrite the modified property if the actual LAST_MODIFICATION_TIME property exists. + List<PropertyValue> timeProp = msg.getMainChunks().getProperties().get(MAPIProperty.LAST_MODIFICATION_TIME); + if (timeProp != null && ! timeProp.isEmpty()) { + Calendar cal = ((PropertyValue.TimePropertyValue)timeProp.get(0)).getValue(); + metadata.set(TikaCoreProperties.MODIFIED, cal); + } + + } + private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { @@ -395,7 +495,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { Chunks chunks = msg.getMainChunks(); StringChunk sentByServerType = chunks.getSentByServerType(); if (sentByServerType != null) { - metadata.set(Office.MAPI_SENT_BY_SERVER_TYPE, sentByServerType.getValue()); + metadata.set(MAPI.SENT_BY_SERVER_TYPE, sentByServerType.getValue()); } Map<MAPIProperty, List<Chunk>> mainChunks = msg.getMainChunks().getAll(); @@ -411,12 +511,12 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME, metadata); setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME), - Office.MAPI_FROM_REPRESENTING_NAME, metadata); + MAPI.FROM_REPRESENTING_NAME, metadata); 
setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), Message.MESSAGE_FROM_EMAIL, metadata); setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS), - Office.MAPI_FROM_REPRESENTING_EMAIL, metadata); + MAPI.FROM_REPRESENTING_EMAIL, metadata); for (Recipient recipient : buildRecipients()) { switch (recipient.recipientType) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java index a87c6cb84..f1c9f9e66 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java @@ -36,9 +36,9 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; import org.apache.tika.metadata.PST; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -139,31 +139,31 @@ public class PSTMailItemParser implements Parser { metadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName()); metadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName()); metadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime()); - metadata.set(Office.MAPI_MESSAGE_CLIENT_SUBMIT_TIME, pstMail.getClientSubmitTime()); + 
metadata.set(MAPI.SUBMISSION_ACCEPTED_AT_TIME, pstMail.getClientSubmitTime()); metadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime()); metadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment()); metadata.set(PST.DESCRIPTOR_NODE_ID, valueOf(pstMail.getDescriptorNodeId())); metadata.set(Message.MESSAGE_FROM_EMAIL, pstMail.getSenderEmailAddress()); if (! StringUtils.isBlank(pstMail.getRecipientsString()) && ! pstMail.getRecipientsString().equals("No recipients table!")) { - metadata.set(Office.MAPI_RECIPIENTS_STRING, pstMail.getRecipientsString()); + metadata.set(MAPI.RECIPIENTS_STRING, pstMail.getRecipientsString()); } metadata.set(Message.MESSAGE_TO_DISPLAY_NAME, pstMail.getDisplayTo()); metadata.set(Message.MESSAGE_CC_DISPLAY_NAME, pstMail.getDisplayCC()); metadata.set(Message.MESSAGE_BCC_DISPLAY_NAME, pstMail.getDisplayBCC()); - metadata.set(Office.MAPI_IMPORTANCE, pstMail.getImportance()); - metadata.set(Office.MAPI_PRIORTY, pstMail.getPriority()); - metadata.set(Office.MAPI_IS_FLAGGED, pstMail.isFlagged()); - metadata.set(Office.MAPI_MESSAGE_CLASS, + metadata.set(MAPI.IMPORTANCE, pstMail.getImportance()); + metadata.set(MAPI.PRIORTY, pstMail.getPriority()); + metadata.set(MAPI.IS_FLAGGED, pstMail.isFlagged()); + metadata.set(MAPI.MESSAGE_CLASS, OutlookExtractor.getMessageClass(pstMail.getMessageClass())); metadata.set(Message.MESSAGE_FROM_EMAIL, pstMail.getSenderEmailAddress()); - metadata.set(Office.MAPI_FROM_REPRESENTING_EMAIL, + metadata.set(MAPI.FROM_REPRESENTING_EMAIL, pstMail.getSentRepresentingEmailAddress()); metadata.set(Message.MESSAGE_FROM_NAME, pstMail.getSenderName()); - metadata.set(Office.MAPI_FROM_REPRESENTING_NAME, pstMail.getSentRepresentingName()); + metadata.set(MAPI.FROM_REPRESENTING_NAME, pstMail.getSentRepresentingName()); //add recipient details try { diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index f10f4aa7c..a01bd5a8d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -36,9 +36,9 @@ import org.xml.sax.ContentHandler; import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; @@ -119,12 +119,12 @@ public class OutlookParserTest extends TikaTest { " by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29 Jan 2009 11:17:08 " + "-0800", Arrays.asList(metadata.getValues("Message:Raw-Header:Received"))); - assertEquals("EX", metadata.get(Office.MAPI_SENT_BY_SERVER_TYPE)); - assertEquals("NOTE", metadata.get(Office.MAPI_MESSAGE_CLASS)); + assertEquals("EX", metadata.get(MAPI.SENT_BY_SERVER_TYPE)); + assertEquals("NOTE", metadata.get(MAPI.MESSAGE_CLASS)); assertEquals("Jukka Zitting", metadata.get(Message.MESSAGE_FROM_NAME)); assertEquals("[email protected]", metadata.get(Message.MESSAGE_FROM_EMAIL)); - assertEquals("Jukka Zitting", metadata.get(Office.MAPI_FROM_REPRESENTING_NAME)); - assertEquals("[email protected]", metadata.get(Office.MAPI_FROM_REPRESENTING_EMAIL)); + assertEquals("Jukka Zitting", 
metadata.get(MAPI.FROM_REPRESENTING_NAME)); + assertEquals("[email protected]", metadata.get(MAPI.FROM_REPRESENTING_EMAIL)); //to-name is empty, make sure that we get an empty string. assertEquals("[email protected]", metadata.get(Message.MESSAGE_TO_EMAIL)); @@ -199,9 +199,19 @@ public class OutlookParserTest extends TikaTest { assertEquals("[email protected]", metadata.get(Message.MESSAGE_TO_EMAIL)); - assertEquals("Tests Chang@FT (張毓倫)", metadata.get(Office.MAPI_FROM_REPRESENTING_NAME)); + assertEquals("Tests Chang@FT (張毓倫)", metadata.get(MAPI.FROM_REPRESENTING_NAME)); assertEquals("/O=FT GROUP/OU=FT/CN=RECIPIENTS/CN=LYDIACHANG", - metadata.get(Office.MAPI_FROM_REPRESENTING_EMAIL)); + metadata.get(MAPI.FROM_REPRESENTING_EMAIL)); + + assertEquals("c=TW;a= ;p=FT GROUP;l=FTM02-110329085248Z-89735\u0000", + metadata.get(MAPI.SUBMISSION_ID)); + assertEquals("<[email protected]>", + metadata.get(MAPI.INTERNET_MESSAGE_ID)); + assertTrue(metadata.get(MAPI.SUBMISSION_ACCEPTED_AT_TIME).startsWith("2011-03-29")); + assertTrue(metadata.get("mapi:client-submit-time").startsWith("2011-03-29")); + assertTrue(metadata.get("mapi:message-delivery-time").startsWith("2011-03-29")); + assertTrue(metadata.get("mapi:last-modification-time").startsWith("2011-03-29")); + assertTrue(metadata.get("mapi:creation-time").startsWith("2011-03-29")); } @Test @@ -224,6 +234,11 @@ public class OutlookParserTest extends TikaTest { String content = sw.toString(); assertEquals(2, content.split("<body>").length); assertEquals(2, content.split("<\\/body>").length); + assertEquals("01ccb5408a75b6cf3ad7837949b698499034202313ef000002a160", metadata.get(MAPI.CONVERSATION_INDEX)); + assertEquals("<c8508767c15dbf40a21693142739ea8d564d18f...@exvmbx018-1.exch018.msoutlookonline.net>", + metadata.get(MAPI.INTERNET_REFERENCES)); + assertEquals("<c8508767c15dbf40a21693142739ea8d564d18f...@exvmbx018-1.exch018.msoutlookonline.net>", + metadata.get(MAPI.IN_REPLY_TO_ID)); } @Test @@ -289,8 +304,8 @@ public class 
OutlookParserTest extends TikaTest { private void testMsgClass(String expected, Metadata metadata) { assertTrue(expected.equalsIgnoreCase( - metadata.get(Office.MAPI_MESSAGE_CLASS).replaceAll("_", "")), - expected + ", but got: " + metadata.get(Office.MAPI_MESSAGE_CLASS)); + metadata.get(MAPI.MESSAGE_CLASS).replaceAll("_", "")), + expected + ", but got: " + metadata.get(MAPI.MESSAGE_CLASS)); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java index 8e6863596..73e623393 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java @@ -26,9 +26,9 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; import org.apache.tika.metadata.PST; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; @@ -78,7 +78,7 @@ public class TestLibPstParser extends TikaTest { assertEquals("NOTE", metadataList .get(7) - .get(Office.MAPI_MESSAGE_CLASS)); + .get(MAPI.MESSAGE_CLASS)); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java index 6e9a6d6d1..e73c6c9fe 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java @@ -26,9 +26,9 @@ import java.util.List; import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; +import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; import org.apache.tika.metadata.PST; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -74,9 +74,9 @@ public class OutlookPSTParserTest extends TikaTest { assertEquals("", m1.get(Message.MESSAGE_CC_DISPLAY_NAME)); assertEquals("", m1.get(Message.MESSAGE_BCC_DISPLAY_NAME)); assertEquals("[email protected]", m1.get(Message.MESSAGE_FROM_EMAIL)); - assertEquals("Jörn Kottmann", m1.get(Office.MAPI_FROM_REPRESENTING_NAME)); - assertEquals("[email protected]", m1.get(Office.MAPI_FROM_REPRESENTING_EMAIL)); - assertEquals("NOTE", m1.get(Office.MAPI_MESSAGE_CLASS)); + assertEquals("Jörn Kottmann", m1.get(MAPI.FROM_REPRESENTING_NAME)); + assertEquals("[email protected]", m1.get(MAPI.FROM_REPRESENTING_EMAIL)); + assertEquals("NOTE", m1.get(MAPI.MESSAGE_CLASS)); assertEquals("/Début du fichier de données Outlook", m1.get(PST.PST_FOLDER_PATH)); //test that subject is making it into the xhtml assertContains("<meta name=\"dc:subject\" content=\"Re: Feature Generators\"", m1.get(TikaCoreProperties.TIKA_CONTENT)); @@ -84,11 +84,11 @@ public class OutlookPSTParserTest extends TikaTest { Metadata m6 = metadataList.get(6); 
assertEquals("Couchbase", m6.get(Message.MESSAGE_FROM_NAME)); assertEquals("[email protected]", m6.get(Message.MESSAGE_FROM_EMAIL)); - assertEquals("Couchbase", m6.get(Office.MAPI_FROM_REPRESENTING_NAME)); - assertEquals("[email protected]", m6.get(Office.MAPI_FROM_REPRESENTING_EMAIL)); - assertEquals("NOTE", m1.get(Office.MAPI_MESSAGE_CLASS)); - assertNull(m1.get(Office.MAPI_RECIPIENTS_STRING)); - assertContains("2014-02-26", m1.get(Office.MAPI_MESSAGE_CLIENT_SUBMIT_TIME)); + assertEquals("Couchbase", m6.get(MAPI.FROM_REPRESENTING_NAME)); + assertEquals("[email protected]", m6.get(MAPI.FROM_REPRESENTING_EMAIL)); + assertEquals("NOTE", m1.get(MAPI.MESSAGE_CLASS)); + assertNull(m1.get(MAPI.RECIPIENTS_STRING)); + assertContains("2014-02-26", m1.get(MAPI.SUBMISSION_ACCEPTED_AT_TIME)); //test full EX email assertEquals(
