Author: nick
Date: Fri Nov 25 14:36:03 2011
New Revision: 1206193

URL: http://svn.apache.org/viewvc?rev=1206193&view=rev
Log:
TIKA-789 POIFS Container Detection support for MPP files

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1206193&r1=1206192&r2=1206193&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Fri Nov 25 14:36:03 2011
@@ -24,6 +24,7 @@ import java.nio.channels.FileChannel;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
@@ -67,7 +68,14 @@ public class POIFSContainerDetector impl
 
     /** Microsoft Outlook */
     public static final MediaType MSG = application("vnd.ms-outlook");
-
+    
+    /** Microsoft Project */
+    public static final MediaType MPP = application("vnd.ms-project");
+
+    /** Regexp for matching the MPP Project Properties stream */
+    private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
+    private static final Pattern mppPropsMatch = Pattern.compile("Props\\d+");
+    
     public MediaType detect(InputStream input, Metadata metadata)
              throws IOException {
         // Check if we have access to the document
@@ -134,6 +142,17 @@ public class POIFSContainerDetector impl
                //  of embedded non-office file inside an OLE2 document
                // This is most commonly triggered on nested directories
                return OLE;
+            } else if (names.contains("\u0001CompObj")) {
+               // Could be Project, look for common name patterns
+               boolean matchedProps = false;
+               boolean matchedData = false;
+               for (String name : names) {
+                  if (mppDataMatch.matcher(name).matches()) matchedData = true;
+                  if (mppPropsMatch.matcher(name).matches()) matchedProps = true;
+               }
+               if (matchedProps && matchedData) {
+                  return MPP;
+               }
             } else if (names.contains("\u0001Ole10Native")) {
                 return OLE;
             } else if (names.contains("PerfectOffice_MAIN")) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1206193&r1=1206192&r2=1206193&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Fri Nov 25 14:36:03 2011
@@ -63,10 +63,17 @@ public class TestContainerAwareDetector 
         assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel");
         assertTypeByData("testWORD.doc", "application/msword");
         assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint");
-
-        // Try some ones that POI doesn't handle, that are still OLE2 based
+        
+        assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook");
+        assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook");
+        assertTypeByData("testVISIO.vsd", "application/vnd.visio");
+        assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
         assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
         assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");
+        assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
+        assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
+
+        // Try some ones that POI doesn't handle, that are still OLE2 based
         assertTypeByData("testCOREL.shw", "application/x-corelpresentations");
         assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro");
         assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro");


Reply via email to