Repository: tika
Updated Branches:
  refs/heads/2.x ce4e7e7d9 -> cd98c4cf3


TIKA-2238   add mime detection for embedded MSEquation files


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/cd98c4cf
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/cd98c4cf
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/cd98c4cf

Branch: refs/heads/2.x
Commit: cd98c4cf3c6f9af72cd9aae6aaf063343be4b7d7
Parents: ce4e7e7
Author: tballison <talli...@mitre.org>
Authored: Fri Jan 13 12:02:59 2017 -0500
Committer: tballison <talli...@mitre.org>
Committed: Fri Jan 13 12:02:59 2017 -0500

----------------------------------------------------------------------
 CHANGES.txt                                        |   2 ++
 .../parser/microsoft/POIFSContainerDetector.java   |   8 ++++++++
 .../microsoft/POIContainerExtractionTest.java      |   8 ++++++++
 .../testMSEquation-govdocs-863534.doc              | Bin 0 -> 30720 bytes
 4 files changed, 18 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/cd98c4cf/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index b0a8abc..fc4ef6d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,8 @@ Release 2.0 - ???
 
 Release 1.15 -???
 
+  * Add container detection for embedded MSEquation files (TIKA-2238).
+
   * Add parsing of JBIG2 and extraction of JBIG2 from PDFs when
     required dependencies are added to class path by user.
     Contributed by Pascal Essiembre (TIKA-2232).

http://git-wip-us.apache.org/repos/asf/tika/blob/cd98c4cf/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
index 992692f..703f269 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -72,6 +72,12 @@ public class POIFSContainerDetector implements Detector {
      * Graph/Charts embedded in PowerPoint and Excel
      */
     public static final MediaType MS_GRAPH_CHART = application("vnd.ms-graph");
+
+    /**
+     * Equation embedded in Office docs
+     */
+    public static final MediaType MS_EQUATION = application("vnd.ms-equation");
+
     /**
      * Microsoft Excel
      */
@@ -300,6 +306,8 @@ public class POIFSContainerDetector implements Detector {
                 }
             } else if (names.contains("NativeContent_MAIN")) {
                 return new MediaType(QUATTROPRO, "version", "9"); // .qpw
+            } else if (names.contains("Equation Native")) {
+                return MS_EQUATION;
             } else {
                 for (String name : names) {
                     if (name.startsWith("__substg1.0_")) {

http://git-wip-us.apache.org/repos/asf/tika/blob/cd98c4cf/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index b59bb00..b3a230f 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -379,4 +379,12 @@ public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTe
             assertTrue("didn't find chart in "+suffix, found);
         }
     }
+
+    @Test
+    public void testEmbeddedEquation() throws Exception {
+        //file derives from govdocs1 863534.doc
+        List<Metadata> metadataList = getRecursiveMetadata("testMSEquation-govdocs-863534.doc");
+        assertEquals(3, metadataList.size());
+        assertEquals("application/vnd.ms-equation", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/cd98c4cf/tika-test-resources/src/test/resources/test-documents/testMSEquation-govdocs-863534.doc
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testMSEquation-govdocs-863534.doc b/tika-test-resources/src/test/resources/test-documents/testMSEquation-govdocs-863534.doc
new file mode 100644
index 0000000..bede30e
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testMSEquation-govdocs-863534.doc differ

Reply via email to