svn commit: r1206185 - in /tika/trunk/tika-parsers/src/test/resources/test-documents: testMPP2003.mpp testMPP2007.mpp

2011-11-25 Thread nick
Author: nick
Date: Fri Nov 25 14:19:23 2011
New Revision: 1206185

URL: http://svn.apache.org/viewvc?rev=1206185&view=rev
Log:
TIKA-789 Sample Microsoft Project (MPP) files

Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testMPP2003.mpp   
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testMPP2007.mpp   
(with props)

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMPP2003.mpp
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMPP2003.mpp?rev=1206185&view=auto
==
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testMPP2003.mpp
--
svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMPP2007.mpp
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMPP2007.mpp?rev=1206185&view=auto
==
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testMPP2007.mpp
--
svn:mime-type = application/octet-stream




svn commit: r1206193 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java test/java/org/apache/tika/detect/TestContainerAwareDetector.java

2011-11-25 Thread nick
Author: nick
Date: Fri Nov 25 14:36:03 2011
New Revision: 1206193

URL: http://svn.apache.org/viewvc?rev=1206193&view=rev
Log:
TIKA-789 POIFS Container Detection support for MPP files

Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java

tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1206193&r1=1206192&r2=1206193&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
 Fri Nov 25 14:36:03 2011
@@ -24,6 +24,7 @@ import java.nio.channels.FileChannel;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
@@ -67,7 +68,14 @@ public class POIFSContainerDetector impl
 
 /** Microsoft Outlook */
 public static final MediaType MSG = application("vnd.ms-outlook");
-
+
+/** Microsoft Project */
+public static final MediaType MPP = application("vnd.ms-project");
+
+/** Regexp for matching the MPP Project Properties stream */
+private static final Pattern mppDataMatch = 
Pattern.compile("\\s\\s\\s\\d+");
+private static final Pattern mppPropsMatch = Pattern.compile("Props\\d+");
+
 public MediaType detect(InputStream input, Metadata metadata)
  throws IOException {
 // Check if we have access to the document
@@ -134,6 +142,17 @@ public class POIFSContainerDetector impl
//  of embedded non-office file inside an OLE2 document
// This is most commonly triggered on nested directories
return OLE;
+} else if (names.contains("\u0001CompObj")) {
+   // Could be Project, look for common name patterns
+   boolean matchedProps = false;
+   boolean matchedData = false;
+   for (String name : names) {
+  if (mppDataMatch.matcher(name).matches()) matchedData = true;
+  if (mppPropsMatch.matcher(name).matches()) matchedProps = 
true;
+   }
+   if (matchedProps && matchedData) {
+  return MPP;
+   }
 } else if (names.contains("\u0001Ole10Native")) {
 return OLE;
 } else if (names.contains("PerfectOffice_MAIN")) {

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1206193&r1=1206192&r2=1206193&view=diff
==
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 Fri Nov 25 14:36:03 2011
@@ -63,10 +63,17 @@ public class TestContainerAwareDetector 
 assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel");
 assertTypeByData("testWORD.doc", "application/msword");
 assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint");
-
-// Try some ones that POI doesn't handle, that are still OLE2 based
+
+assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook");
+assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook");
+assertTypeByData("testVISIO.vsd", "application/vnd.visio");
+assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
 assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
 assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");
+assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
+assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
+
+// Try some ones that POI doesn't handle, that are still OLE2 based
 assertTypeByData("testCOREL.shw", "application/x-corelpresentations");
 assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro");
 assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro");




svn commit: r1206225 - /tika/trunk/CHANGES.txt

2011-11-25 Thread nick
Author: nick
Date: Fri Nov 25 15:38:32 2011
New Revision: 1206225

URL: http://svn.apache.org/viewvc?rev=1206225&view=rev
Log:
Add CHANGES entry for TIKA-789

Modified:
tika/trunk/CHANGES.txt

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1206225&r1=1206224&r2=1206225&view=diff
==
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Nov 25 15:38:32 2011
@@ -34,6 +34,9 @@ Release 1.1 - Current Development
allows for specific, detailed detectors to take preference over
the default mime magic + filename detector. (TIKA-786)
 
+ * Microsoft Project (MPP): Filetype detection has been fixed,
+   and basic metadata (but no text) is now extracted. (TIKA-789)
+
 Release 1.0 - 11/4/2011
 -
 




svn commit: r1206228 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

2011-11-25 Thread nick
Author: nick
Date: Fri Nov 25 15:43:05 2011
New Revision: 1206228

URL: http://svn.apache.org/viewvc?rev=1206228&view=rev
Log:
TIKA-789 Add the project type to the OfficeParser mimetype list, and add a note 
on why Works is missing from the list

Modified:

tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1206228&r1=1206227&r2=1206228&view=diff
==
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 Fri Nov 25 15:43:05 2011
@@ -68,7 +68,9 @@ public class OfficeParser extends Abstra
 POIFSDocumentType.ENCRYPTED.type,
 POIFSDocumentType.POWERPOINT.type,
 POIFSDocumentType.PUBLISHER.type,
+POIFSDocumentType.PROJECT.type,
 POIFSDocumentType.VISIO.type,
+// Works isn't supported
 POIFSDocumentType.OUTLOOK.type,
 
MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12")
 )));