This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/master by this push:
       new  c3383b0   TIKA-2281 -- extract mapi message class
c3383b0 is described below

commit c3383b005f9879019a9a18e6eae362e3498acf54
Author: tballison <[email protected]>
AuthorDate: Wed Mar 1 11:16:13 2017 -0500

    TIKA-2281 -- extract mapi message class
---
 .../apache/tika/batch/BatchProcessDriverCLI.java   |  11 ++++++++-
 .../main/java/org/apache/tika/metadata/Office.java |   8 +++++++
 .../tika/parser/microsoft/OutlookExtractor.java    |  26 +++++++++++++++++++++
 .../tika/parser/microsoft/OutlookParserTest.java   |  20 ++++++++++++++++
 .../test-documents/testMSG_Appointment.msg         | Bin 0 -> 30208 bytes
 .../resources/test-documents/testMSG_Contact.msg   | Bin 0 -> 29184 bytes
 .../test/resources/test-documents/testMSG_Post.msg | Bin 0 -> 21504 bytes
 .../test-documents/testMSG_StickyNote.msg          | Bin 0 -> 10240 bytes
 .../test/resources/test-documents/testMSG_Task.msg | Bin 0 -> 19968 bytes
 9 files changed, 64 insertions(+), 1 deletion(-)

diff --git 
a/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java 
b/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
index b27dd20..50c5053 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
@@ -394,10 +394,19 @@ public class BatchProcessDriverCLI {
 
 
     public static void main(String[] args) throws Exception {
+        final BatchProcessDriverCLI runner = new BatchProcessDriverCLI(args);
+
+        //make absolutely certain that the child process is killed
+        Runtime.getRuntime().addShutdownHook(new Thread() {
+            @Override
+            public void run() {
+                runner.stop();
+            }
+        });
 
-        BatchProcessDriverCLI runner = new BatchProcessDriverCLI(args);
         runner.execute();
         System.out.println("FSBatchProcessDriver has gracefully completed");
         System.exit(0);
     }
+
 }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java 
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index eeaaa4f..2860487 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -122,4 +122,12 @@ public interface Office {
      */
     Property OBJECT_COUNT = Property.internalInteger(
           PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + 
"object-count");
+
+    /**
+     * MAPI message class.  What type of .msg/MAPI file is it?
+     */
+    Property MAPI_MESSAGE_CLASS = Property.internalClosedChoise(
+        PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER + 
"mapi-message-class",
+            "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK", 
"UNKNOWN", "UNSPECIFIED" );
+
 }
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index d22379d..1186eff 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -54,6 +54,7 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
@@ -111,6 +112,9 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         try {
             msg.setReturnNullOnMissingChunk(true);
 
+            String messageClass = getMessageClass(msg);
+            metadata.set(Office.MAPI_MESSAGE_CLASS, messageClass);
+
             // If the message contains strings that aren't stored
             //  as Unicode, try to sort out an encoding for them
             if (msg.has7BitEncodingStrings()) {
@@ -304,6 +308,28 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         }
     }
 
+    //TODO: replace this with getMessageClassEnum when we upgrade POI
+    private String getMessageClass(MAPIMessage msg) throws 
ChunkNotFoundException {
+        String mc = msg.getMessageClass();
+        if (mc == null || mc.trim().length() == 0) {
+            return "UNSPECIFIED";
+        } else if (mc.equalsIgnoreCase("IPM.Note")) {
+            return "NOTE";
+        } else if (mc.equalsIgnoreCase("IPM.Contact")) {
+            return "CONTACT";
+        } else if (mc.equalsIgnoreCase("IPM.Appointment")) {
+            return "APPOINTMENT";
+        } else if (mc.equalsIgnoreCase("IPM.StickyNote")) {
+            return "STICKY_NOTE";
+        } else if (mc.equalsIgnoreCase("IPM.Task")) {
+            return "TASK";
+        } else if (mc.equalsIgnoreCase("IPM.Post")) {
+            return "POST";
+        } else {
+            return "UNKNOWN";
+        }
+    }
+
     //As of 3.15, POI currently returns header[] by splitting on /\r?\n/
     //this rebuilds headers that are broken up over several lines
     //this also decodes encoded headers.
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index c15308f..86fd6c7 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -32,6 +32,7 @@ import java.util.regex.Pattern;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
@@ -251,4 +252,23 @@ public class OutlookParserTest extends TikaTest {
         assertEquals(2, content.split("<body>").length);
         assertEquals(2, content.split("<\\/body>").length);
     }
+
+    @Test
+    public void testMAPIMessageClasses() throws Exception {
+
+        for (String messageClass : new String[]{
+                "Appointment", "Contact", "Post", "StickyNote", "Task"
+        }) {
+            testMsgClass(messageClass,
+                    getXML("testMSG_" + messageClass + ".msg").metadata);
+        }
+
+        testMsgClass("NOTE", getXML("test-outlook2003.msg").metadata);
+
+    }
+
+    private void testMsgClass(String expected, Metadata metadata) {
+        assertTrue(expected + ", but got: " + 
metadata.get(Office.MAPI_MESSAGE_CLASS),
+                
expected.equalsIgnoreCase(metadata.get(Office.MAPI_MESSAGE_CLASS).replaceAll("_",
 "")));
+    }
 }
diff --git 
a/tika-parsers/src/test/resources/test-documents/testMSG_Appointment.msg 
b/tika-parsers/src/test/resources/test-documents/testMSG_Appointment.msg
new file mode 100644
index 0000000..c124d31
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testMSG_Appointment.msg differ
diff --git a/tika-parsers/src/test/resources/test-documents/testMSG_Contact.msg 
b/tika-parsers/src/test/resources/test-documents/testMSG_Contact.msg
new file mode 100644
index 0000000..d925f3d
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testMSG_Contact.msg differ
diff --git a/tika-parsers/src/test/resources/test-documents/testMSG_Post.msg 
b/tika-parsers/src/test/resources/test-documents/testMSG_Post.msg
new file mode 100644
index 0000000..3dffd09
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testMSG_Post.msg differ
diff --git 
a/tika-parsers/src/test/resources/test-documents/testMSG_StickyNote.msg 
b/tika-parsers/src/test/resources/test-documents/testMSG_StickyNote.msg
new file mode 100644
index 0000000..13873a1
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testMSG_StickyNote.msg differ
diff --git a/tika-parsers/src/test/resources/test-documents/testMSG_Task.msg 
b/tika-parsers/src/test/resources/test-documents/testMSG_Task.msg
new file mode 100644
index 0000000..a2ac9f8
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testMSG_Task.msg differ

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to