This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new c3383b0 TIKA-2281 -- extract mapi message class
c3383b0 is described below
commit c3383b005f9879019a9a18e6eae362e3498acf54
Author: tballison <[email protected]>
AuthorDate: Wed Mar 1 11:16:13 2017 -0500
TIKA-2281 -- extract mapi message class
---
.../apache/tika/batch/BatchProcessDriverCLI.java | 11 ++++++++-
.../main/java/org/apache/tika/metadata/Office.java | 8 +++++++
.../tika/parser/microsoft/OutlookExtractor.java | 26 +++++++++++++++++++++
.../tika/parser/microsoft/OutlookParserTest.java | 20 ++++++++++++++++
.../test-documents/testMSG_Appointment.msg | Bin 0 -> 30208 bytes
.../resources/test-documents/testMSG_Contact.msg | Bin 0 -> 29184 bytes
.../test/resources/test-documents/testMSG_Post.msg | Bin 0 -> 21504 bytes
.../test-documents/testMSG_StickyNote.msg | Bin 0 -> 10240 bytes
.../test/resources/test-documents/testMSG_Task.msg | Bin 0 -> 19968 bytes
9 files changed, 64 insertions(+), 1 deletion(-)
diff --git
a/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
b/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
index b27dd20..50c5053 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
@@ -394,10 +394,19 @@ public class BatchProcessDriverCLI {
public static void main(String[] args) throws Exception {
+ final BatchProcessDriverCLI runner = new BatchProcessDriverCLI(args);
+
+ //make absolutely certain that the child process is killed
+ Runtime.getRuntime().addShutdownHook(new Thread() {
+ @Override
+ public void run() {
+ runner.stop();
+ }
+ });
- BatchProcessDriverCLI runner = new BatchProcessDriverCLI(args);
runner.execute();
System.out.println("FSBatchProcessDriver has gracefully completed");
System.exit(0);
}
+
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index eeaaa4f..2860487 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -122,4 +122,12 @@ public interface Office {
*/
Property OBJECT_COUNT = Property.internalInteger(
PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER +
"object-count");
+
+ /**
+ * MAPI message class. What type of .msg/MAPI file is it?
+ */
+ Property MAPI_MESSAGE_CLASS = Property.internalClosedChoise(
+ PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER +
"mapi-message-class",
+ "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK",
"UNKNOWN", "UNSPECIFIED" );
+
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index d22379d..1186eff 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -54,6 +54,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -111,6 +112,9 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
try {
msg.setReturnNullOnMissingChunk(true);
+ String messageClass = getMessageClass(msg);
+ metadata.set(Office.MAPI_MESSAGE_CLASS, messageClass);
+
// If the message contains strings that aren't stored
// as Unicode, try to sort out an encoding for them
if (msg.has7BitEncodingStrings()) {
@@ -304,6 +308,28 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
}
+ //TODO: replace this with getMessageClassEnum when we upgrade POI
+ private String getMessageClass(MAPIMessage msg) throws
ChunkNotFoundException {
+ String mc = msg.getMessageClass();
+ if (mc == null || mc.trim().length() == 0) {
+ return "UNSPECIFIED";
+ } else if (mc.equalsIgnoreCase("IPM.Note")) {
+ return "NOTE";
+ } else if (mc.equalsIgnoreCase("IPM.Contact")) {
+ return "CONTACT";
+ } else if (mc.equalsIgnoreCase("IPM.Appointment")) {
+ return "APPOINTMENT";
+ } else if (mc.equalsIgnoreCase("IPM.StickyNote")) {
+ return "STICKY_NOTE";
+ } else if (mc.equalsIgnoreCase("IPM.Task")) {
+ return "TASK";
+ } else if (mc.equalsIgnoreCase("IPM.Post")) {
+ return "POST";
+ } else {
+ return "UNKNOWN";
+ }
+ }
+
//As of 3.15, POI currently returns header[] by splitting on /\r?\n/
//this rebuilds headers that are broken up over several lines
//this also decodes encoded headers.
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index c15308f..86fd6c7 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -32,6 +32,7 @@ import java.util.regex.Pattern;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@@ -251,4 +252,23 @@ public class OutlookParserTest extends TikaTest {
assertEquals(2, content.split("<body>").length);
assertEquals(2, content.split("<\\/body>").length);
}
+
+ @Test
+ public void testMAPIMessageClasses() throws Exception {
+
+ for (String messageClass : new String[]{
+ "Appointment", "Contact", "Post", "StickyNote", "Task"
+ }) {
+ testMsgClass(messageClass,
+ getXML("testMSG_" + messageClass + ".msg").metadata);
+ }
+
+ testMsgClass("NOTE", getXML("test-outlook2003.msg").metadata);
+
+ }
+
+ private void testMsgClass(String expected, Metadata metadata) {
+ assertTrue(expected + ", but got: " +
metadata.get(Office.MAPI_MESSAGE_CLASS),
+
expected.equalsIgnoreCase(metadata.get(Office.MAPI_MESSAGE_CLASS).replaceAll("_",
"")));
+ }
}
diff --git
a/tika-parsers/src/test/resources/test-documents/testMSG_Appointment.msg
b/tika-parsers/src/test/resources/test-documents/testMSG_Appointment.msg
new file mode 100644
index 0000000..c124d31
Binary files /dev/null and
b/tika-parsers/src/test/resources/test-documents/testMSG_Appointment.msg differ
diff --git a/tika-parsers/src/test/resources/test-documents/testMSG_Contact.msg
b/tika-parsers/src/test/resources/test-documents/testMSG_Contact.msg
new file mode 100644
index 0000000..d925f3d
Binary files /dev/null and
b/tika-parsers/src/test/resources/test-documents/testMSG_Contact.msg differ
diff --git a/tika-parsers/src/test/resources/test-documents/testMSG_Post.msg
b/tika-parsers/src/test/resources/test-documents/testMSG_Post.msg
new file mode 100644
index 0000000..3dffd09
Binary files /dev/null and
b/tika-parsers/src/test/resources/test-documents/testMSG_Post.msg differ
diff --git
a/tika-parsers/src/test/resources/test-documents/testMSG_StickyNote.msg
b/tika-parsers/src/test/resources/test-documents/testMSG_StickyNote.msg
new file mode 100644
index 0000000..13873a1
Binary files /dev/null and
b/tika-parsers/src/test/resources/test-documents/testMSG_StickyNote.msg differ
diff --git a/tika-parsers/src/test/resources/test-documents/testMSG_Task.msg
b/tika-parsers/src/test/resources/test-documents/testMSG_Task.msg
new file mode 100644
index 0000000..a2ac9f8
Binary files /dev/null and
b/tika-parsers/src/test/resources/test-documents/testMSG_Task.msg differ
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].