Repository: tika Updated Branches: refs/heads/2.x ce4e7e7d9 -> cd98c4cf3
TIKA-2238 add mime detection for embedded MSEquation files Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/cd98c4cf Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/cd98c4cf Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/cd98c4cf Branch: refs/heads/2.x Commit: cd98c4cf3c6f9af72cd9aae6aaf063343be4b7d7 Parents: ce4e7e7 Author: tballison <talli...@mitre.org> Authored: Fri Jan 13 12:02:59 2017 -0500 Committer: tballison <talli...@mitre.org> Committed: Fri Jan 13 12:02:59 2017 -0500 ---------------------------------------------------------------------- CHANGES.txt | 2 ++ .../parser/microsoft/POIFSContainerDetector.java | 8 ++++++++ .../microsoft/POIContainerExtractionTest.java | 8 ++++++++ .../testMSEquation-govdocs-863534.doc | Bin 0 -> 30720 bytes 4 files changed, 18 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/cd98c4cf/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index b0a8abc..fc4ef6d 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -17,6 +17,8 @@ Release 2.0 - ??? Release 1.15 -??? + * Add container detection for embedded MSEquation files (TIKA-2238). + * Add parsing of JBIG2 and extraction of JBIG2 from PDFs when required dependencies are added to class path by user. Contributed by Pascal Essiembre (TIKA-2232). http://git-wip-us.apache.org/repos/asf/tika/blob/cd98c4cf/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java index 992692f..703f269 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java @@ -72,6 +72,12 @@ public class POIFSContainerDetector implements Detector { * Graph/Charts embedded in PowerPoint and Excel */ public static final MediaType MS_GRAPH_CHART = application("vnd.ms-graph"); + + /** + * Equation embedded in Office docs + */ + public static final MediaType MS_EQUATION = application("vnd.ms-equation"); + /** * Microsoft Excel */ @@ -300,6 +306,8 @@ public class POIFSContainerDetector implements Detector { } } else if (names.contains("NativeContent_MAIN")) { return new MediaType(QUATTROPRO, "version", "9"); // .qpw + } else if (names.contains("Equation Native")) { + return MS_EQUATION; } else { for (String name : names) { if (name.startsWith("__substg1.0_")) { http://git-wip-us.apache.org/repos/asf/tika/blob/cd98c4cf/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java index b59bb00..b3a230f 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java @@ -379,4 +379,12 @@ public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTe assertTrue("didn't find chart in "+suffix, found); } } + + @Test + public void testEmbeddedEquation() throws Exception { + //file derives from govdocs1 863534.doc + List<Metadata> metadataList = getRecursiveMetadata("testMSEquation-govdocs-863534.doc"); + assertEquals(3, metadataList.size()); + assertEquals("application/vnd.ms-equation", metadataList.get(2).get(Metadata.CONTENT_TYPE)); + } } http://git-wip-us.apache.org/repos/asf/tika/blob/cd98c4cf/tika-test-resources/src/test/resources/test-documents/testMSEquation-govdocs-863534.doc ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/resources/test-documents/testMSEquation-govdocs-863534.doc b/tika-test-resources/src/test/resources/test-documents/testMSEquation-govdocs-863534.doc new file mode 100644 index 0000000..bede30e Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testMSEquation-govdocs-863534.doc differ