This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/2.x by this push:
new 24160a1 TIKA-2281 add mapi message type
24160a1 is described below
commit 24160a1c036c4e9942622377cf9afc44483227d9
Author: tballison <[email protected]>
AuthorDate: Wed Mar 1 11:19:58 2017 -0500
TIKA-2281 add mapi message type
---
.../main/java/org/apache/tika/metadata/Office.java | 8 +++++++
.../tika/parser/microsoft/OutlookExtractor.java | 26 +++++++++++++++++++++
.../tika/parser/microsoft/OutlookParserTest.java | 20 ++++++++++++++++
.../test-documents/testMSG_Appointment.msg | Bin 0 -> 30208 bytes
.../resources/test-documents/testMSG_Contact.msg | Bin 0 -> 29184 bytes
.../test/resources/test-documents/testMSG_Post.msg | Bin 0 -> 21504 bytes
.../test-documents/testMSG_StickyNote.msg | Bin 0 -> 10240 bytes
.../test/resources/test-documents/testMSG_Task.msg | Bin 0 -> 19968 bytes
8 files changed, 54 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index eeaaa4f..2860487 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -122,4 +122,12 @@ public interface Office {
*/
Property OBJECT_COUNT = Property.internalInteger(
PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER +
"object-count");
+
+ /**
+ * MAPI message class. What type of .msg/MAPI file is it?
+ */
+ Property MAPI_MESSAGE_CLASS = Property.internalClosedChoise(
+ PREFIX_DOC_META + Metadata.NAMESPACE_PREFIX_DELIMITER +
"mapi-message-class",
+ "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK",
"UNKNOWN", "UNSPECIFIED" );
+
}
diff --git
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 614bb5b..5dfd126 100644
---
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -57,6 +57,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -116,6 +117,9 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
try {
msg.setReturnNullOnMissingChunk(true);
+ String messageClass = getMessageClass(msg);
+ metadata.set(Office.MAPI_MESSAGE_CLASS, messageClass);
+
// If the message contains strings that aren't stored
// as Unicode, try to sort out an encoding for them
if (msg.has7BitEncodingStrings()) {
@@ -313,6 +317,28 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
}
+ //TODO: replace this with getMessageClassEnum when we upgrade POI
+ private String getMessageClass(MAPIMessage msg) throws
ChunkNotFoundException {
+ String mc = msg.getMessageClass();
+ if (mc == null || mc.trim().length() == 0) {
+ return "UNSPECIFIED";
+ } else if (mc.equalsIgnoreCase("IPM.Note")) {
+ return "NOTE";
+ } else if (mc.equalsIgnoreCase("IPM.Contact")) {
+ return "CONTACT";
+ } else if (mc.equalsIgnoreCase("IPM.Appointment")) {
+ return "APPOINTMENT";
+ } else if (mc.equalsIgnoreCase("IPM.StickyNote")) {
+ return "STICKY_NOTE";
+ } else if (mc.equalsIgnoreCase("IPM.Task")) {
+ return "TASK";
+ } else if (mc.equalsIgnoreCase("IPM.Post")) {
+ return "POST";
+ } else {
+ return "UNKNOWN";
+ }
+ }
+
//As of 3.15, POI currently returns header[] by splitting on /\r?\n/
//this rebuilds headers that are broken up over several lines
//this also decodes encoded headers.
diff --git
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index c15308f..86fd6c7 100644
---
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -32,6 +32,7 @@ import java.util.regex.Pattern;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@@ -251,4 +252,23 @@ public class OutlookParserTest extends TikaTest {
assertEquals(2, content.split("<body>").length);
assertEquals(2, content.split("<\\/body>").length);
}
+
+ @Test
+ public void testMAPIMessageClasses() throws Exception {
+
+ for (String messageClass : new String[]{
+ "Appointment", "Contact", "Post", "StickyNote", "Task"
+ }) {
+ testMsgClass(messageClass,
+ getXML("testMSG_" + messageClass + ".msg").metadata);
+ }
+
+ testMsgClass("NOTE", getXML("test-outlook2003.msg").metadata);
+
+ }
+
+ private void testMsgClass(String expected, Metadata metadata) {
+ assertTrue(expected + ", but got: " +
metadata.get(Office.MAPI_MESSAGE_CLASS),
+
expected.equalsIgnoreCase(metadata.get(Office.MAPI_MESSAGE_CLASS).replaceAll("_",
"")));
+ }
}
diff --git
a/tika-test-resources/src/test/resources/test-documents/testMSG_Appointment.msg
b/tika-test-resources/src/test/resources/test-documents/testMSG_Appointment.msg
new file mode 100644
index 0000000..c124d31
Binary files /dev/null and
b/tika-test-resources/src/test/resources/test-documents/testMSG_Appointment.msg
differ
diff --git
a/tika-test-resources/src/test/resources/test-documents/testMSG_Contact.msg
b/tika-test-resources/src/test/resources/test-documents/testMSG_Contact.msg
new file mode 100644
index 0000000..d925f3d
Binary files /dev/null and
b/tika-test-resources/src/test/resources/test-documents/testMSG_Contact.msg
differ
diff --git
a/tika-test-resources/src/test/resources/test-documents/testMSG_Post.msg
b/tika-test-resources/src/test/resources/test-documents/testMSG_Post.msg
new file mode 100644
index 0000000..3dffd09
Binary files /dev/null and
b/tika-test-resources/src/test/resources/test-documents/testMSG_Post.msg differ
diff --git
a/tika-test-resources/src/test/resources/test-documents/testMSG_StickyNote.msg
b/tika-test-resources/src/test/resources/test-documents/testMSG_StickyNote.msg
new file mode 100644
index 0000000..13873a1
Binary files /dev/null and
b/tika-test-resources/src/test/resources/test-documents/testMSG_StickyNote.msg
differ
diff --git
a/tika-test-resources/src/test/resources/test-documents/testMSG_Task.msg
b/tika-test-resources/src/test/resources/test-documents/testMSG_Task.msg
new file mode 100644
index 0000000..a2ac9f8
Binary files /dev/null and
b/tika-test-resources/src/test/resources/test-documents/testMSG_Task.msg differ
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].