This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/2.x by this push:
       new  24160a1   TIKA-2281    add mapi message type
24160a1 is described below

commit 24160a1c036c4e9942622377cf9afc44483227d9
Author: tballison <[email protected]>
AuthorDate: Wed Mar 1 11:19:58 2017 -0500

    TIKA-2281    add mapi message type
---
 .../main/java/org/apache/tika/metadata/Office.java |   8 +++++++
 .../tika/parser/microsoft/OutlookExtractor.java    |  26 +++++++++++++++++++++
 .../tika/parser/microsoft/OutlookParserTest.java   |  20 ++++++++++++++++
 .../test-documents/testMSG_Appointment.msg         | Bin 0 -> 30208 bytes
 .../resources/test-documents/testMSG_Contact.msg   | Bin 0 -> 29184 bytes
 .../test/resources/test-documents/testMSG_Post.msg | Bin 0 -> 21504 bytes
 .../test-documents/testMSG_StickyNote.msg          | Bin 0 -> 10240 bytes
 .../test/resources/test-documents/testMSG_Task.msg | Bin 0 -> 19968 bytes
 8 files changed, 54 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index eeaaa4f..2860487 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -122,4 +122,12 @@ public interface Office {
      */
     Property OBJECT_COUNT = Property.internalInteger(
           PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "object-count");
+
+    /**
+     * MAPI message class.  What type of .msg/MAPI file is it?
+     */
+    Property MAPI_MESSAGE_CLASS = Property.internalClosedChoise(
+        PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + "mapi-message-class",
+            "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK", "UNKNOWN", "UNSPECIFIED" );
+
 }
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 614bb5b..5dfd126 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -57,6 +57,7 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
@@ -116,6 +117,9 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         try {
             msg.setReturnNullOnMissingChunk(true);
 
+            String messageClass = getMessageClass(msg);
+            metadata.set(Office.MAPI_MESSAGE_CLASS, messageClass);
+
             // If the message contains strings that aren't stored
             //  as Unicode, try to sort out an encoding for them
             if (msg.has7BitEncodingStrings()) {
@@ -313,6 +317,28 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         }
     }
 
+    //TODO: replace this with getMessageClassEnum when we upgrad POI
+    private String getMessageClass(MAPIMessage msg) throws ChunkNotFoundException {
+        String mc = msg.getMessageClass();
+        if (mc == null || mc.trim().length() == 0) {
+            return "UNSPECIFIED";
+        } else if (mc.equalsIgnoreCase("IPM.Note")) {
+            return "NOTE";
+        } else if (mc.equalsIgnoreCase("IPM.Contact")) {
+            return "CONTACT";
+        } else if (mc.equalsIgnoreCase("IPM.Appointment")) {
+            return "APPOINTMENT";
+        } else if (mc.equalsIgnoreCase("IPM.StickyNote")) {
+            return "STICKY_NOTE";
+        } else if (mc.equalsIgnoreCase("IPM.Task")) {
+            return "TASK";
+        } else if (mc.equalsIgnoreCase("IPM.Post")) {
+            return "POST";
+        } else {
+            return "UNKNOWN";
+        }
+    }
+
     //As of 3.15, POI currently returns header[] by splitting on /\r?\n/
     //this rebuilds headers that are broken up over several lines
     //this also decodes encoded headers.
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index c15308f..86fd6c7 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -32,6 +32,7 @@ import java.util.regex.Pattern;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
@@ -251,4 +252,23 @@ public class OutlookParserTest extends TikaTest {
         assertEquals(2, content.split("<body>").length);
         assertEquals(2, content.split("<\\/body>").length);
     }
+
+    @Test
+    public void testMAPIMessageClasses() throws Exception {
+
+        for (String messageClass : new String[]{
+                "Appointment", "Contact", "Post", "StickyNote", "Task"
+        }) {
+            testMsgClass(messageClass,
+                    getXML("testMSG_" + messageClass + ".msg").metadata);
+        }
+
+        testMsgClass("NOTE", getXML("test-outlook2003.msg").metadata);
+
+    }
+
+    private void testMsgClass(String expected, Metadata metadata) {
+        assertTrue(expected + ", but got: " + metadata.get(Office.MAPI_MESSAGE_CLASS),
+                expected.equalsIgnoreCase(metadata.get(Office.MAPI_MESSAGE_CLASS).replaceAll("_", "")));
+    }
 }
diff --git a/tika-test-resources/src/test/resources/test-documents/testMSG_Appointment.msg b/tika-test-resources/src/test/resources/test-documents/testMSG_Appointment.msg
new file mode 100644
index 0000000..c124d31
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testMSG_Appointment.msg differ
diff --git a/tika-test-resources/src/test/resources/test-documents/testMSG_Contact.msg b/tika-test-resources/src/test/resources/test-documents/testMSG_Contact.msg
new file mode 100644
index 0000000..d925f3d
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testMSG_Contact.msg differ
diff --git a/tika-test-resources/src/test/resources/test-documents/testMSG_Post.msg b/tika-test-resources/src/test/resources/test-documents/testMSG_Post.msg
new file mode 100644
index 0000000..3dffd09
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testMSG_Post.msg differ
diff --git a/tika-test-resources/src/test/resources/test-documents/testMSG_StickyNote.msg b/tika-test-resources/src/test/resources/test-documents/testMSG_StickyNote.msg
new file mode 100644
index 0000000..13873a1
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testMSG_StickyNote.msg differ
diff --git a/tika-test-resources/src/test/resources/test-documents/testMSG_Task.msg b/tika-test-resources/src/test/resources/test-documents/testMSG_Task.msg
new file mode 100644
index 0000000..a2ac9f8
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testMSG_Task.msg differ

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to