Author: nick
Date: Fri Nov 25 14:36:03 2011
New Revision: 1206193
URL: http://svn.apache.org/viewvc?rev=1206193&view=rev
Log:
TIKA-789 POIFS Container Detection support for MPP files
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1206193&r1=1206192&r2=1206193&view=diff
==
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
Fri Nov 25 14:36:03 2011
@@ -24,6 +24,7 @@ import java.nio.channels.FileChannel;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
+import java.util.regex.Pattern;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
@@ -67,7 +68,14 @@ public class POIFSContainerDetector impl
/** Microsoft Outlook */
public static final MediaType MSG = application("vnd.ms-outlook");
-
+
+/** Microsoft Project */
+public static final MediaType MPP = application("vnd.ms-project");
+
+/** Regexp for matching the MPP Project Properties stream */
+private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
+private static final Pattern mppPropsMatch = Pattern.compile("Props\\d+");
+
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
// Check if we have access to the document
@@ -134,6 +142,17 @@ public class POIFSContainerDetector impl
// of embedded non-office file inside an OLE2 document
// This is most commonly triggered on nested directories
return OLE;
+} else if (names.contains("\u0001CompObj")) {
+ // Could be Project, look for common name patterns
+ boolean matchedProps = false;
+ boolean matchedData = false;
+ for (String name : names) {
+ if (mppDataMatch.matcher(name).matches()) matchedData = true;
+ if (mppPropsMatch.matcher(name).matches()) matchedProps = true;
+ }
+ if (matchedProps && matchedData) {
+ return MPP;
+ }
} else if (names.contains("\u0001Ole10Native")) {
return OLE;
} else if (names.contains("PerfectOffice_MAIN")) {
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1206193&r1=1206192&r2=1206193&view=diff
==
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Fri Nov 25 14:36:03 2011
@@ -63,10 +63,17 @@ public class TestContainerAwareDetector
assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel");
assertTypeByData("testWORD.doc", "application/msword");
assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint");
-
-// Try some ones that POI doesn't handle, that are still OLE2 based
+
+assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook");
+assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook");
+assertTypeByData("testVISIO.vsd", "application/vnd.visio");
+assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");
+assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
+assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
+
+// Try some ones that POI doesn't handle, that are still OLE2 based
assertTypeByData("testCOREL.shw", "application/x-corelpresentations");
assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro");
assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro");