This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 745f13c TIKA-1865 - step 1, split out sender name from sender
email/exchange info where possible in MSG files.
745f13c is described below
commit 745f13cbd0dd2143d3a95e414f399bd73b0e47ab
Author: tballison <[email protected]>
AuthorDate: Wed Mar 1 13:34:37 2017 -0500
TIKA-1865 - step 1, split out sender name from sender email/exchange info
where possible in MSG files.
---
.../java/org/apache/tika/metadata/Message.java | 32 +++++-
.../main/java/org/apache/tika/metadata/Office.java | 27 +++++
.../tika/parser/microsoft/OutlookExtractor.java | 126 ++++++++++++++++-----
.../tika/parser/microsoft/OutlookParserTest.java | 17 ++-
4 files changed, 170 insertions(+), 32 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Message.java
b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
index dad3952..af853c6 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Message.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
@@ -16,10 +16,10 @@
*/
package org.apache.tika.metadata;
-import org.apache.tika.Tika;
-
/**
* A collection of Message related property names.
+ *
+ * See also {@link Office}'s MAPI-specific properties.
*/
public interface Message {
String MESSAGE_PREFIX = "Message"+ Metadata.NAMESPACE_PREFIX_DELIMITER;
@@ -35,4 +35,32 @@ public interface Message {
String MESSAGE_CC = "Message-Cc";
String MESSAGE_BCC = "Message-Bcc";
+
+ /**
+ * Where possible, we try to separate the name from the email address
+ * in Message files. This is multivalued for cases where an email is sent
+ * "on behalf of" someone...this is still to be implemented, though.
+ * The name may be an organization name.
+ */
+ Property MESSAGE_FROM_NAME =
Property.internalTextBag(MESSAGE_PREFIX+"From-Name");
+
+ /**
+ * Where possible, we try to separate the name from the email address
+ * in Message files. This is multivalued for cases where an email is sent
+ * "on behalf of" someone...this is still to be implemented, though.
+ */
+ Property MESSAGE_FROM_EMAIL =
Property.internalTextBag(MESSAGE_PREFIX+"From-Email");
+
+ /** Name(s) of the "To" recipients, kept apart from their email addresses; multivalued. */
+ Property MESSAGE_TO_NAME =
Property.internalTextBag(MESSAGE_PREFIX+"To-Name");
+
+ /**
+ * Email address(es) of the "To" recipients; multivalued.
+ * Keyed "To-Email" so it does not collide with {@link #MESSAGE_TO_NAME}.
+ */
+ Property MESSAGE_TO_EMAIL =
Property.internalTextBag(MESSAGE_PREFIX+"To-Email");
+
+ /** Name(s) of the "Cc" recipients, kept apart from their email addresses; multivalued. */
+ Property MESSAGE_CC_NAME =
Property.internalTextBag(MESSAGE_PREFIX+"CC-Name");
+
+ /**
+ * Email address(es) of the "Cc" recipients; multivalued.
+ * Keyed "CC-Email" so it does not collide with {@link #MESSAGE_CC_NAME}.
+ */
+ Property MESSAGE_CC_EMAIL =
Property.internalTextBag(MESSAGE_PREFIX+"CC-Email");
+
+ /**
+ * Name(s) of the "Bcc" recipients; multivalued.
+ * Keyed "BCC-Name" so it does not collide with {@link #MESSAGE_CC_NAME}.
+ */
+ Property MESSAGE_BCC_NAME =
Property.internalTextBag(MESSAGE_PREFIX+"BCC-Name");
+
+ /**
+ * Email address(es) of the "Bcc" recipients; multivalued.
+ * Keyed "BCC-Email" so it does not collide with the Cc or Bcc name properties.
+ */
+ Property MESSAGE_BCC_EMAIL =
Property.internalTextBag(MESSAGE_PREFIX+"BCC-Email");
+
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 2860487..86a22d8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -130,4 +130,31 @@ public interface Office {
PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER +
"mapi-message-class",
"APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK",
"UNKNOWN", "UNSPECIFIED" );
+ /** Value following "/o=" in the sender's Exchange address (presumably the organization -- confirm); set by OutlookExtractor only when the sender address type is "ex". */
+ Property MAPI_EXCHANGE_FROM_O = Property.internalText(
+
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-exchange-from-o");
+
+ /** Value following the first "/ou=" in the sender's Exchange address (presumably the organizational unit -- confirm). */
+ Property MAPI_EXCHANGE_FROM_OU = Property.internalText(
+
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-exchange-from-ou");
+
+ /** Every "/cn=" value in the sender's Exchange address except "recipients"; multivalued. */
+ Property MAPI_EXCHANGE_FROM_CN = Property.internalTextBag(
+
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-exchange-from-cn");
+
+ /** Same as {@link #MAPI_EXCHANGE_FROM_O}, but taken from the "sent representing" Exchange address. */
+ Property MAPI_EXCHANGE_FROM_REPRESENTING_O = Property.internalText(
+
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-exchange-from-representing-o");
+
+ /** Same as {@link #MAPI_EXCHANGE_FROM_OU}, but taken from the "sent representing" Exchange address. */
+ Property MAPI_EXCHANGE_FROM_REPRESENTING_OU = Property.internalText(
+
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-exchange-from-representing-ou");
+
+ /** Same as {@link #MAPI_EXCHANGE_FROM_CN}, but taken from the "sent representing" Exchange address; multivalued. */
+ Property MAPI_EXCHANGE_FROM_REPRESENTING_CN = Property.internalTextBag(
+
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-exchange-from-representing-cn");
+
+ /** Value of the message's "sent by server type" chunk, when that chunk is present. */
+ Property MAPI_SENT_BY_SERVER_TYPE = Property.internalText(
+
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-sent-by-server-type");
+
+ /** First value of the message's SENT_REPRESENTING_NAME chunk. */
+ Property MAPI_FROM_REPRESENTING_NAME = Property.internalText(
+
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-from-representing-name");
+
+ /** First value of the SENT_REPRESENTING_EMAIL_ADDRESS chunk; set only when the sender address type is not "ex". */
+ Property MAPI_FROM_REPRESENTING_EMAIL = Property.internalText(
+
PREFIX_DOC_META+Metadata.NAMESPACE_PREFIX_DELIMITER+"mapi-from-representing-email");
+
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 1186eff..1f87183 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -53,10 +53,11 @@ import org.apache.poi.util.CodePageUtil;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlEncodingDetector;
@@ -76,7 +77,12 @@ import org.xml.sax.SAXException;
public class OutlookExtractor extends AbstractPOIFSExtractor {
- private final static MediaType RTF = MediaType.application("rtf");
+ // "/cn=" value that addExchange() skips (compared case-insensitively) when collecting CN components
+ private final static String RECIPIENTS = "recipients";
+ // Case-insensitive: captures the text after "/o=" up to (not including) the next '/'
+ private final static Pattern EXCHANGE_O =
Pattern.compile("(?i)/o=([^/]+)");
+ // Case-insensitive: captures the text after "/ou=" up to (not including) the next '/'
+ private final static Pattern EXCHANGE_OU =
Pattern.compile("(?i)/ou=([^/]+)");
+ // Case-insensitive: captures the text after "/cn=" up to (not including) the next '/'; may match several times
+ private final static Pattern EXCHANGE_CN =
Pattern.compile("(?i)/cn=([^/]+)");
+
+
private static Pattern HEADER_KEY_PAT =
Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");
//this according to the spec; in practice, it is probably more likely
@@ -123,13 +129,10 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
// Start with the metadata
String subject = msg.getSubject();
+ Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
String from = msg.getDisplayFrom();
- metadata.set(TikaCoreProperties.CREATOR, from);
- metadata.set(Metadata.MESSAGE_FROM, from);
- metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
- metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
- metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
+ handleFromTo(headers, metadata);
metadata.set(TikaCoreProperties.TITLE, subject);
// TODO: Move to description in Tika 2.0
@@ -143,31 +146,25 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
} catch (ChunkNotFoundException he) {
} // Will be fixed in POI 3.7 Final
- try {
- Map<String, String[]> headers =
normalizeHeaders(msg.getHeaders());
- for (Map.Entry<String, String[]> e : headers.entrySet()) {
- String headerKey = e.getKey();
- for (String headerValue : e.getValue()) {
-
metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX+headerKey, headerValue);
- }
- }
- } catch (ChunkNotFoundException e) {
+ for (Map.Entry<String, String[]> e : headers.entrySet()) {
+ String headerKey = e.getKey();
+ for (String headerValue : e.getValue()) {
+ metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX +
headerKey, headerValue);
+ }
}
- // Date - try two ways to find it
+ // Date - try two ways to find it
// First try via the proper chunk
if (msg.getMessageDate() != null) {
metadata.set(TikaCoreProperties.CREATED,
msg.getMessageDate().getTime());
metadata.set(TikaCoreProperties.MODIFIED,
msg.getMessageDate().getTime());
} else {
- try {
- // Failing that try via the raw headers
- String[] headers = msg.getHeaders();
- if (headers != null && headers.length > 0) {
- for (String header : headers) {
- if
(header.toLowerCase(Locale.ROOT).startsWith("date:")) {
- String date =
header.substring(header.indexOf(':') + 1).trim();
+ if (headers != null && headers.size() > 0) {
+ for (Map.Entry<String, String[]> header :
headers.entrySet()) {
+ String headerKey = header.getKey();
+ if
(headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
+ String date =
headerKey.substring(headerKey.indexOf(':') + 1).trim();
// See if we can parse it as a normal mail date
try {
@@ -183,9 +180,6 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
}
}
- } catch (ChunkNotFoundException he) {
- // We can't find the date, sorry...
- }
}
@@ -308,7 +302,83 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
}
- //TODO: replace this with getMessageClassEnum when we upgrad POI
+ /**
+  * Writes the sender and recipient information of {@code msg} into the metadata.
+  * <p>
+  * The display strings for From/To/Cc/Bcc are always set. The sender name and the
+  * "sent representing" name are then added from the main chunks. If the sender
+  * address type is "ex" (case-insensitive), the sender and "sent representing"
+  * addresses are split into their Exchange o/ou/cn parts; otherwise they are
+  * stored as plain email addresses.
+  *
+  * @param headers  normalized message headers; not read by this method
+  * @param metadata metadata object to populate
+  * @throws ChunkNotFoundException if a display field cannot be read from the message
+  */
+ private void handleFromTo(Map<String, String[]> headers, Metadata metadata)
+         throws ChunkNotFoundException {
+     String displayFrom = msg.getDisplayFrom();
+     metadata.set(TikaCoreProperties.CREATOR, displayFrom);
+     metadata.set(Metadata.MESSAGE_FROM, displayFrom);
+     metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
+     metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
+     metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
+
+     StringChunk serverType = msg.getMainChunks().getSentByServerType();
+     if (serverType != null) {
+         metadata.set(Office.MAPI_SENT_BY_SERVER_TYPE, serverType.getValue());
+     }
+
+     Map<MAPIProperty, List<Chunk>> allChunks = msg.getMainChunks().getAll();
+
+     // An absent or empty address-type chunk is treated as "not Exchange".
+     String addressType = "";
+     List<Chunk> addressTypeChunks = allChunks.get(MAPIProperty.SENDER_ADDRTYPE);
+     if (addressTypeChunks != null && !addressTypeChunks.isEmpty()) {
+         addressType = addressTypeChunks.get(0).toString();
+     }
+
+     addChunks(allChunks.get(MAPIProperty.SENDER_NAME),
+             Message.MESSAGE_FROM_NAME, metadata);
+     addChunks(allChunks.get(MAPIProperty.SENT_REPRESENTING_NAME),
+             Office.MAPI_FROM_REPRESENTING_NAME, metadata);
+
+     if (!addressType.equalsIgnoreCase("ex")) {
+         // Non-Exchange sender: keep the addresses as they are.
+         addChunks(allChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS),
+                 Message.MESSAGE_FROM_EMAIL, metadata);
+         addChunks(allChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS),
+                 Office.MAPI_FROM_REPRESENTING_EMAIL, metadata);
+         return;
+     }
+
+     // Exchange sender: break the addresses into their /o=, /ou= and /cn= parts.
+     addExchange(allChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS),
+             Office.MAPI_EXCHANGE_FROM_O, Office.MAPI_EXCHANGE_FROM_OU,
+             Office.MAPI_EXCHANGE_FROM_CN, metadata);
+     addExchange(allChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS),
+             Office.MAPI_EXCHANGE_FROM_REPRESENTING_O,
+             Office.MAPI_EXCHANGE_FROM_REPRESENTING_OU,
+             Office.MAPI_EXCHANGE_FROM_REPRESENTING_CN, metadata);
+ }
+
+ /**
+  * Splits an Exchange-style address into its "/o=", "/ou=" and "/cn=" parts and
+  * stores them in the metadata. Only the first chunk is examined; nothing is
+  * written when the list or the address string is null or empty.
+  *
+  * @param chunks     chunks holding the address; may be null or empty
+  * @param propertyO  property set to the first "/o=" value, if any
+  * @param propertyOU property set to the first "/ou=" value, if any
+  * @param propertyCN property that receives every "/cn=" value except "recipients"
+  * @param metadata   metadata object to populate
+  */
+ private void addExchange(List<Chunk> chunks,Property propertyO,
+         Property propertyOU, Property propertyCN, Metadata metadata) {
+     if (chunks == null || chunks.isEmpty()) {
+         return;
+     }
+     String address = chunks.get(0).toString();
+     if (address == null || address.isEmpty()) {
+         return;
+     }
+
+     Matcher organization = EXCHANGE_O.matcher(address);
+     if (organization.find()) {
+         metadata.set(propertyO, organization.group(1));
+     }
+
+     Matcher unit = EXCHANGE_OU.matcher(address);
+     if (unit.find()) {
+         metadata.set(propertyOU, unit.group(1));
+     }
+
+     // An address may contain several /cn= parts; keep all but "recipients".
+     for (Matcher commonName = EXCHANGE_CN.matcher(address); commonName.find(); ) {
+         String value = commonName.group(1);
+         if (value.equalsIgnoreCase(RECIPIENTS)) {
+             continue;
+         }
+         metadata.add(propertyCN, value);
+     }
+ }
+
+ /**
+  * Sets {@code property} to the string form of the first chunk in the list.
+  * Does nothing when the list is null or empty.
+  *
+  * @param chunks   chunks to read from; may be null or empty
+  * @param property metadata property to set
+  * @param metadata metadata object to populate
+  */
+ private void addChunks(List<Chunk> chunks, Property property, Metadata metadata) {
+     boolean hasValue = chunks != null && !chunks.isEmpty();
+     if (hasValue) {
+         Chunk first = chunks.get(0);
+         metadata.set(property, first.toString());
+     }
+ }
+
+ //TODO: replace this with getMessageClassEnum when we upgrade POI
private String getMessageClass(MAPIMessage msg) throws
ChunkNotFoundException {
String mc = msg.getMessageClass();
if (mc == null || mc.trim().length() == 0) {
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 86fd6c7..2ea3bd7 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -31,6 +31,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -56,7 +57,6 @@ public class OutlookParserTest extends TikaTest {
"/test-documents/test-outlook.msg")) {
parser.parse(stream, handler, metadata, new ParseContext());
}
-
assertEquals(
"application/vnd.ms-outlook",
metadata.get(Metadata.CONTENT_TYPE));
@@ -124,6 +124,12 @@ public class OutlookParserTest extends TikaTest {
assertContains("from athena.apache.org (HELO athena.apache.org)
(140.211.11.136)\n" +
" by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29 Jan 2009
11:17:08 -0800",
Arrays.asList(metadata.getValues("Message:Raw-Header:Received")));
+ assertEquals("EX", metadata.get(Office.MAPI_SENT_BY_SERVER_TYPE));
+ assertEquals("NOTE", metadata.get(Office.MAPI_MESSAGE_CLASS));
+ assertEquals("Jukka Zitting", metadata.get(Message.MESSAGE_FROM_NAME));
+ assertEquals("[email protected]",
metadata.get(Message.MESSAGE_FROM_EMAIL));
+ assertEquals("Jukka Zitting",
metadata.get(Office.MAPI_FROM_REPRESENTING_NAME));
+ assertEquals("[email protected]",
metadata.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
}
/**
@@ -141,7 +147,6 @@ public class OutlookParserTest extends TikaTest {
"/test-documents/test-outlook2003.msg")) {
parser.parse(stream, handler, metadata, new ParseContext());
}
-
assertEquals(
"application/vnd.ms-outlook",
metadata.get(Metadata.CONTENT_TYPE));
@@ -189,6 +194,14 @@ public class OutlookParserTest extends TikaTest {
// Make sure that the Chinese actually came through
assertContains("\u5F35\u6BD3\u502B",
metadata.get(TikaCoreProperties.CREATOR));
assertContains("\u9673\u60E0\u73CD", content);
+
+ assertEquals("FT GROUP", metadata.get(Office.MAPI_EXCHANGE_FROM_O));
+ assertEquals("FT", metadata.get(Office.MAPI_EXCHANGE_FROM_OU));
+ assertEquals("LYDIACHANG", metadata.get(Office.MAPI_EXCHANGE_FROM_CN));
+ assertEquals("Tests Chang@FT (張毓倫)",
metadata.get(Office.MAPI_FROM_REPRESENTING_NAME));
+ assertEquals("FT GROUP",
metadata.get(Office.MAPI_EXCHANGE_FROM_REPRESENTING_O));
+ assertEquals("FT",
metadata.get(Office.MAPI_EXCHANGE_FROM_REPRESENTING_OU));
+ assertEquals("LYDIACHANG",
metadata.get(Office.MAPI_EXCHANGE_FROM_REPRESENTING_CN));
}
@Test
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].