Author: nick
Date: Fri Nov 25 14:36:03 2011
New Revision: 1206193

URL: http://svn.apache.org/viewvc?rev=1206193&view=rev
Log:
TIKA-789 POIFS Container Detection support for MPP files

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1206193&r1=1206192&r2=1206193&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Fri Nov 25 14:36:03 2011 @@ -24,6 +24,7 @@ import java.nio.channels.FileChannel; import java.util.Collections; import java.util.HashSet; import java.util.Set; +import java.util.regex.Pattern; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; @@ -67,7 +68,14 @@ public class POIFSContainerDetector impl /** Microsoft Outlook */ public static final MediaType MSG = application("vnd.ms-outlook"); - + + /** Microsoft Project */ + public static final MediaType MPP = application("vnd.ms-project"); + + /** Regexp for matching the MPP Project Properties stream */ + private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+"); + private static final Pattern mppPropsMatch = Pattern.compile("Props\\d+"); + public MediaType detect(InputStream input, Metadata metadata) throws IOException { // Check if we have access to the document @@ -134,6 +142,17 @@ public class POIFSContainerDetector impl // of embedded non-office file inside an OLE2 document // This is most commonly triggered on nested directories return OLE; + } else if (names.contains("\u0001CompObj")) { + // Could be Project, look for common name patterns + boolean matchedProps = false; + 
boolean matchedData = false; + for (String name : names) { + if (mppDataMatch.matcher(name).matches()) matchedData = true; + if (mppPropsMatch.matcher(name).matches()) matchedProps = true; + } + if (matchedProps && matchedData) { + return MPP; + } } else if (names.contains("\u0001Ole10Native")) { return OLE; } else if (names.contains("PerfectOffice_MAIN")) { Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1206193&r1=1206192&r2=1206193&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Fri Nov 25 14:36:03 2011 @@ -63,10 +63,17 @@ public class TestContainerAwareDetector assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel"); assertTypeByData("testWORD.doc", "application/msword"); assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint"); - - // Try some ones that POI doesn't handle, that are still OLE2 based + + assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook"); + assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook"); + assertTypeByData("testVISIO.vsd", "application/vnd.visio"); + assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher"); assertTypeByData("testWORKS.wps", "application/vnd.ms-works"); assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works"); + assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project"); + assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project"); + + // Try some ones that POI doesn't handle, that are still OLE2 based assertTypeByData("testCOREL.shw", "application/x-corelpresentations"); assertTypeByData("testQUATTRO.qpw", 
"application/x-quattro-pro"); assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro");