This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/master by this push:
       new  745f13c   TIKA-1865 - step 1, split out sender name from sender 
email/exchange info where possible in MSG files.
745f13c is described below

commit 745f13cbd0dd2143d3a95e414f399bd73b0e47ab
Author: tballison <[email protected]>
AuthorDate: Wed Mar 1 13:34:37 2017 -0500

    TIKA-1865 - step 1, split out sender name from sender email/exchange info 
where possible in MSG files.
---
 .../java/org/apache/tika/metadata/Message.java     |  32 +++++-
 .../main/java/org/apache/tika/metadata/Office.java |  27 +++++
 .../tika/parser/microsoft/OutlookExtractor.java    | 126 ++++++++++++++++-----
 .../tika/parser/microsoft/OutlookParserTest.java   |  17 ++-
 4 files changed, 170 insertions(+), 32 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Message.java 
b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
index dad3952..af853c6 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Message.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
@@ -16,10 +16,10 @@
  */
 package org.apache.tika.metadata;
 
-import org.apache.tika.Tika;
-
 /**
  * A collection of Message related property names.
+ *
+ * See also {@link Office}'s MAPI-specific properties.
  */
 public interface Message {
     String MESSAGE_PREFIX = "Message"+ Metadata.NAMESPACE_PREFIX_DELIMITER;
@@ -35,4 +35,32 @@ public interface Message {
     String MESSAGE_CC = "Message-Cc";
     
     String MESSAGE_BCC = "Message-Bcc";
+
+    /**
+     * Where possible, we try to separate the name from the email address
+     * in Message files.  This is multivalued for cases where an email is sent
+     * "on behalf of" someone...this is still to be implemented, though.
+     * The name may be an organization name.
+     */
+    Property MESSAGE_FROM_NAME = Property.internalTextBag(MESSAGE_PREFIX + "From-Name");
+
+    /**
+     * Where possible, we try to separate the name from the email address
+     * in Message files.  This is multivalued for cases where an email is sent
+     * "on behalf of" someone...this is still to be implemented, though.
+     */
+    Property MESSAGE_FROM_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "From-Email");
+
+    /*
+     * Name/email counterparts for the To, CC and BCC recipients.  Every property
+     * below must use its own key: a name must never share a metadata key with an
+     * email address, and CC values must never share a key with BCC values.
+     */
+
+    /** Multivalued name of a "To" recipient. */
+    Property MESSAGE_TO_NAME = Property.internalTextBag(MESSAGE_PREFIX + "To-Name");
+
+    /** Multivalued email address of a "To" recipient. */
+    Property MESSAGE_TO_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "To-Email");
+
+    /** Multivalued name of a "CC" recipient. */
+    Property MESSAGE_CC_NAME = Property.internalTextBag(MESSAGE_PREFIX + "CC-Name");
+
+    /** Multivalued email address of a "CC" recipient. */
+    Property MESSAGE_CC_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "CC-Email");
+
+    /** Multivalued name of a "BCC" recipient. */
+    Property MESSAGE_BCC_NAME = Property.internalTextBag(MESSAGE_PREFIX + "BCC-Name");
+
+    /** Multivalued email address of a "BCC" recipient. */
+    Property MESSAGE_BCC_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "BCC-Email");
+
 }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java 
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 2860487..86a22d8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -130,4 +130,31 @@ public interface Office {
         PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + 
"mapi-message-class",
             "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK", 
"UNKNOWN", "UNSPECIFIED" );
 
+    /**
+     * The <code>/o=</code> component of the sender's address, recorded by
+     * OutlookExtractor when the sender address type is "EX".
+     */
+    Property MAPI_EXCHANGE_FROM_O = Property.internalText(
+            PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "mapi-exchange-from-o");
+
+    /**
+     * The <code>/ou=</code> component of the sender's address, recorded by
+     * OutlookExtractor when the sender address type is "EX".
+     */
+    Property MAPI_EXCHANGE_FROM_OU = Property.internalText(
+            PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "mapi-exchange-from-ou");
+
+    /**
+     * The <code>/cn=</code> components of the sender's address.  Multivalued:
+     * OutlookExtractor adds every such component except "recipients".
+     */
+    Property MAPI_EXCHANGE_FROM_CN = Property.internalTextBag(
+            PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "mapi-exchange-from-cn");
+
+    /**
+     * Same as {@link #MAPI_EXCHANGE_FROM_O}, but taken from the
+     * "sent representing" address.
+     */
+    Property MAPI_EXCHANGE_FROM_REPRESENTING_O = Property.internalText(
+            PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "mapi-exchange-from-representing-o");
+
+    /**
+     * Same as {@link #MAPI_EXCHANGE_FROM_OU}, but taken from the
+     * "sent representing" address.
+     */
+    Property MAPI_EXCHANGE_FROM_REPRESENTING_OU = Property.internalText(
+            PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "mapi-exchange-from-representing-ou");
+
+    /**
+     * Same as {@link #MAPI_EXCHANGE_FROM_CN}, but taken from the
+     * "sent representing" address.
+     */
+    Property MAPI_EXCHANGE_FROM_REPRESENTING_CN = Property.internalTextBag(
+            PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "mapi-exchange-from-representing-cn");
+
+    /**
+     * Value of the message's "sent by server type" chunk, when that chunk is present.
+     */
+    Property MAPI_SENT_BY_SERVER_TYPE = Property.internalText(
+            PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "mapi-sent-by-server-type");
+
+    /**
+     * Name found in the message's "sent representing name" chunk.
+     */
+    Property MAPI_FROM_REPRESENTING_NAME = Property.internalText(
+            PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "mapi-from-representing-name");
+
+    /**
+     * The "sent representing" address, recorded as-is by OutlookExtractor
+     * when the sender address type is not "EX".
+     */
+    Property MAPI_FROM_REPRESENTING_EMAIL = Property.internalText(
+            PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "mapi-from-representing-email");
+
 }
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 1186eff..1f87183 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -53,10 +53,11 @@ import org.apache.poi.util.CodePageUtil;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Message;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlEncodingDetector;
@@ -76,7 +77,12 @@ import org.xml.sax.SAXException;
 public class OutlookExtractor extends AbstractPOIFSExtractor {
 
 
-    private final static MediaType RTF = MediaType.application("rtf");
+    private final static String RECIPIENTS = "recipients";
+    private final static Pattern EXCHANGE_O = 
Pattern.compile("(?i)/o=([^/]+)");
+    private final static Pattern EXCHANGE_OU = 
Pattern.compile("(?i)/ou=([^/]+)");
+    private final static Pattern EXCHANGE_CN = 
Pattern.compile("(?i)/cn=([^/]+)");
+
+
     private static Pattern HEADER_KEY_PAT =
             Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");
     //this according to the spec; in practice, it is probably more likely
@@ -123,13 +129,10 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
 
             // Start with the metadata
             String subject = msg.getSubject();
+            Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
             String from = msg.getDisplayFrom();
 
-            metadata.set(TikaCoreProperties.CREATOR, from);
-            metadata.set(Metadata.MESSAGE_FROM, from);
-            metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
-            metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
-            metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
+            handleFromTo(headers, metadata);
 
             metadata.set(TikaCoreProperties.TITLE, subject);
             // TODO: Move to description in Tika 2.0
@@ -143,31 +146,25 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
                 }
             } catch (ChunkNotFoundException he) {
             } // Will be fixed in POI 3.7 Final
-            try {
-                Map<String, String[]> headers = 
normalizeHeaders(msg.getHeaders());
-                for (Map.Entry<String, String[]> e : headers.entrySet()) {
-                    String headerKey = e.getKey();
-                    for (String headerValue : e.getValue()) {
-                        
metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX+headerKey, headerValue);
-                    }
-                }
-            } catch (ChunkNotFoundException e) {
 
+            for (Map.Entry<String, String[]> e : headers.entrySet()) {
+                String headerKey = e.getKey();
+                for (String headerValue : e.getValue()) {
+                    metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + 
headerKey, headerValue);
+                }
             }
 
-                    // Date - try two ways to find it
+            // Date - try two ways to find it
             // First try via the proper chunk
             if (msg.getMessageDate() != null) {
                 metadata.set(TikaCoreProperties.CREATED, 
msg.getMessageDate().getTime());
                 metadata.set(TikaCoreProperties.MODIFIED, 
msg.getMessageDate().getTime());
             } else {
-                try {
-                    // Failing that try via the raw headers
-                    String[] headers = msg.getHeaders();
-                    if (headers != null && headers.length > 0) {
-                        for (String header : headers) {
-                            if 
(header.toLowerCase(Locale.ROOT).startsWith("date:")) {
-                                String date = 
header.substring(header.indexOf(':') + 1).trim();
+                    if (headers != null && headers.size() > 0) {
+                        for (Map.Entry<String, String[]> header : 
headers.entrySet()) {
+                            String headerKey = header.getKey();
+                            if 
(headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
+                                String date = 
headerKey.substring(headerKey.indexOf(':') + 1).trim();
 
                                 // See if we can parse it as a normal mail date
                                 try {
@@ -183,9 +180,6 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
                             }
                         }
                     }
-                } catch (ChunkNotFoundException he) {
-                    // We can't find the date, sorry...
-                }
             }
 
 
@@ -308,7 +302,83 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         }
     }
 
-    //TODO: replace this with getMessageClassEnum when we upgrad POI
+    /**
+     * Copies sender and recipient information from the message into the metadata.
+     * <p>
+     * The display strings for from/to/cc/bcc are always recorded.  The sender name and the
+     * "sent representing" name are then taken from the main chunks.  Finally the sender
+     * addresses are recorded: when the sender address type is "ex" (compared
+     * case-insensitively) they are split into their /o=, /ou= and /cn= components,
+     * otherwise they are stored unchanged as email addresses.
+     *
+     * @param headers  normalized message headers; not consulted by this method
+     * @param metadata metadata object to populate
+     * @throws ChunkNotFoundException presumably propagated from the message accessors
+     *                                when a chunk is absent
+     */
+    private void handleFromTo(Map<String, String[]> headers, Metadata metadata)
+            throws ChunkNotFoundException {
+        String from = msg.getDisplayFrom();
+        metadata.set(TikaCoreProperties.CREATOR, from);
+        metadata.set(Metadata.MESSAGE_FROM, from);
+        metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
+        metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
+        metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
+
+        // Fetch the main chunks once and reuse them for every lookup below.
+        Chunks chunks = msg.getMainChunks();
+        StringChunk sentByServerType = chunks.getSentByServerType();
+        if (sentByServerType != null) {
+            metadata.set(Office.MAPI_SENT_BY_SERVER_TYPE, sentByServerType.getValue());
+        }
+
+        Map<MAPIProperty, List<Chunk>> mainChunks = chunks.getAll();
+
+        List<Chunk> senderAddressType = mainChunks.get(MAPIProperty.SENDER_ADDRTYPE);
+        String senderAddressTypeString = "";
+        if (senderAddressType != null && !senderAddressType.isEmpty()) {
+            senderAddressTypeString = senderAddressType.get(0).toString();
+        }
+
+        addChunks(mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME, metadata);
+        addChunks(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME),
+                Office.MAPI_FROM_REPRESENTING_NAME, metadata);
+
+        // Constant-first comparison stays safe should the chunk's toString() ever return null.
+        if ("ex".equalsIgnoreCase(senderAddressTypeString)) {
+            addExchange(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS),
+                    Office.MAPI_EXCHANGE_FROM_O, Office.MAPI_EXCHANGE_FROM_OU,
+                    Office.MAPI_EXCHANGE_FROM_CN, metadata);
+            addExchange(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS),
+                    Office.MAPI_EXCHANGE_FROM_REPRESENTING_O,
+                    Office.MAPI_EXCHANGE_FROM_REPRESENTING_OU,
+                    Office.MAPI_EXCHANGE_FROM_REPRESENTING_CN, metadata);
+        } else {
+            addChunks(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS),
+                    Message.MESSAGE_FROM_EMAIL, metadata);
+            addChunks(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS),
+                    Office.MAPI_FROM_REPRESENTING_EMAIL, metadata);
+        }
+    }
+
+    /**
+     * Breaks the string form of the first chunk into its <code>/o=</code>,
+     * <code>/ou=</code> and <code>/cn=</code> components and records them.
+     * <p>
+     * The first <code>/o=</code> and <code>/ou=</code> matches are set on their
+     * properties; every <code>/cn=</code> match is added to {@code propertyCN}, except
+     * values equal (ignoring case) to "recipients".  Nothing is recorded when the chunk
+     * list is null or empty, or when the first chunk's string form is null or empty.
+     *
+     * @param chunks     chunks holding the address; only the first one is read
+     * @param propertyO  property receiving the <code>/o=</code> value
+     * @param propertyOU property receiving the <code>/ou=</code> value
+     * @param propertyCN property receiving the <code>/cn=</code> values
+     * @param metadata   metadata object to populate
+     */
+    private void addExchange(List<Chunk> chunks, Property propertyO,
+                             Property propertyOU, Property propertyCN, Metadata metadata) {
+        if (chunks == null || chunks.isEmpty()) {
+            return;
+        }
+        String address = chunks.get(0).toString();
+        if (address == null || address.isEmpty()) {
+            return;
+        }
+
+        Matcher o = EXCHANGE_O.matcher(address);
+        if (o.find()) {
+            metadata.set(propertyO, o.group(1));
+        }
+
+        Matcher ou = EXCHANGE_OU.matcher(address);
+        if (ou.find()) {
+            metadata.set(propertyOU, ou.group(1));
+        }
+
+        for (Matcher cn = EXCHANGE_CN.matcher(address); cn.find(); ) {
+            String value = cn.group(1);
+            if (RECIPIENTS.equalsIgnoreCase(value)) {
+                continue;
+            }
+            metadata.add(propertyCN, value);
+        }
+    }
+
+    /**
+     * Sets {@code property} to the string form of the first chunk in the list.
+     * Does nothing when the list is null or empty; any further chunks are not read.
+     *
+     * @param chunks   chunks to read from, may be null
+     * @param property property to set
+     * @param metadata metadata object to populate
+     */
+    private void addChunks(List<Chunk> chunks, Property property, Metadata metadata) {
+        if (chunks != null && !chunks.isEmpty()) {
+            metadata.set(property, chunks.get(0).toString());
+        }
+    }
+
+    //TODO: replace this with getMessageClassEnum when we upgrade POI
     private String getMessageClass(MAPIMessage msg) throws 
ChunkNotFoundException {
         String mc = msg.getMessageClass();
         if (mc == null || mc.trim().length() == 0) {
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 86fd6c7..2ea3bd7 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -31,6 +31,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Message;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -56,7 +57,6 @@ public class OutlookParserTest extends TikaTest {
                 "/test-documents/test-outlook.msg")) {
             parser.parse(stream, handler, metadata, new ParseContext());
         }
-
         assertEquals(
                 "application/vnd.ms-outlook",
                 metadata.get(Metadata.CONTENT_TYPE));
@@ -124,6 +124,12 @@ public class OutlookParserTest extends TikaTest {
         assertContains("from athena.apache.org (HELO athena.apache.org) 
(140.211.11.136)\n" +
                 "    by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29 Jan 2009 
11:17:08 -0800",
                 
Arrays.asList(metadata.getValues("Message:Raw-Header:Received")));
+        assertEquals("EX", metadata.get(Office.MAPI_SENT_BY_SERVER_TYPE));
+        assertEquals("NOTE", metadata.get(Office.MAPI_MESSAGE_CLASS));
+        assertEquals("Jukka Zitting", metadata.get(Message.MESSAGE_FROM_NAME));
+        assertEquals("[email protected]", 
metadata.get(Message.MESSAGE_FROM_EMAIL));
+        assertEquals("Jukka Zitting", 
metadata.get(Office.MAPI_FROM_REPRESENTING_NAME));
+        assertEquals("[email protected]", 
metadata.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
     }
 
     /**
@@ -141,7 +147,6 @@ public class OutlookParserTest extends TikaTest {
                 "/test-documents/test-outlook2003.msg")) {
             parser.parse(stream, handler, metadata, new ParseContext());
         }
-
         assertEquals(
                 "application/vnd.ms-outlook",
                 metadata.get(Metadata.CONTENT_TYPE));
@@ -189,6 +194,14 @@ public class OutlookParserTest extends TikaTest {
         // Make sure that the Chinese actually came through
         assertContains("\u5F35\u6BD3\u502B", 
metadata.get(TikaCoreProperties.CREATOR));
         assertContains("\u9673\u60E0\u73CD", content);
+
+        assertEquals("FT GROUP", metadata.get(Office.MAPI_EXCHANGE_FROM_O));
+        assertEquals("FT", metadata.get(Office.MAPI_EXCHANGE_FROM_OU));
+        assertEquals("LYDIACHANG", metadata.get(Office.MAPI_EXCHANGE_FROM_CN));
+        assertEquals("Tests Chang@FT (張毓倫)", 
metadata.get(Office.MAPI_FROM_REPRESENTING_NAME));
+        assertEquals("FT GROUP", 
metadata.get(Office.MAPI_EXCHANGE_FROM_REPRESENTING_O));
+        assertEquals("FT", 
metadata.get(Office.MAPI_EXCHANGE_FROM_REPRESENTING_OU));
+        assertEquals("LYDIACHANG", 
metadata.get(Office.MAPI_EXCHANGE_FROM_REPRESENTING_CN));
     }
 
     @Test

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to