Repository: tika Updated Branches: refs/heads/master 2ae7206d9 -> 8a45f67a2
TIKA-2069 -- extract macros from MSOffice docs, fix tests to find target metadata object in any order Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/8a45f67a Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/8a45f67a Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/8a45f67a Branch: refs/heads/master Commit: 8a45f67a2e3641b08fcfb5e2283e4a43ff86f3cd Parents: 2ae7206 Author: tballison <[email protected]> Authored: Thu Sep 22 08:59:53 2016 -0400 Committer: tballison <[email protected]> Committed: Thu Sep 22 08:59:53 2016 -0400 ---------------------------------------------------------------------- .../src/test/java/org/apache/tika/TikaTest.java | 38 +++++++++++++++++ .../tika/parser/microsoft/ExcelParserTest.java | 17 ++++---- .../parser/microsoft/PowerPointParserTest.java | 18 +++++--- .../tika/parser/microsoft/WordParserTest.java | 15 ++++--- .../parser/microsoft/ooxml/OOXMLParserTest.java | 45 +++++++++++--------- 5 files changed, 91 insertions(+), 42 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-core/src/test/java/org/apache/tika/TikaTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index a699ac8..690db33 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -99,6 +99,44 @@ public abstract class TikaTest { assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle)); } + /** + * Test that in at least one item in metadataList, all keys and values + * in minExpected are contained. + * <p> + * The values in minExpected are tested for whether they are contained + * within a value in the target. 
If minExpected=&quot;text/x-vbasic&quot; and + * what was actually found in the target within metadataList is + * &quot;text/x-vbasic; charset=windows-1252&quot;, + * that is counted as a hit. + * + * @param minExpected + * @param metadataList + */ + public static void assertContainsAtLeast(Metadata minExpected, List<Metadata> metadataList) { + + for (Metadata m : metadataList) { + int foundPropertyCount = 0; + for (String n : minExpected.names()) { + int foundValCount = 0; + for (String foundVal : m.getValues(n)) { + for (String expectedVal : minExpected.getValues(n)) { + if (foundVal.contains(expectedVal)) { + foundValCount++; + } + } + } + if (foundValCount == minExpected.getValues(n).length) { + foundPropertyCount++; + } + } + if (foundPropertyCount == minExpected.names().length) { + //found everything! + return; + } + } + //TODO: figure out how to have more informative error message + fail("Couldn't find everything within a single metadata item"); + } protected static class XMLResult { public final String xml; public final Metadata metadata; http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java index eb1a814..db137e0 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java @@ -474,14 +474,15 @@ public class ExcelParserTest extends TikaTest { } @Test - public void testMacroinXls() throws Exception { - List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xls"); - Metadata macroMetadata = metadataList.get(1); - assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - 
assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), - macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + public void testMacros() throws Exception { + Metadata minExpected = new Metadata(); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()"); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt"); + minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic"); + minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + + assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls")); } } http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java index 41400c5..41c5077 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java @@ -254,12 +254,18 @@ public class PowerPointParserTest extends TikaTest { @Test @Ignore("POI 3.15-final not finding any macros in this ppt") public void testMacros() throws Exception { + Metadata minExpected = new Metadata(); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()"); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()"); + minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic"); + minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + 
TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.ppt"); - Metadata macroMetadata = metadataList.get(1); - assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), - macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + assertContainsAtLeast(minExpected, metadataList); } + + + + } http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java index e63a61b..bfb7ca1 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java @@ -524,14 +524,15 @@ public class WordParserTest extends TikaTest { @Test public void testMacros() throws Exception { - //debug(getRecursiveMetadata("SimpleMacro.doc")); + Metadata minExpected = new Metadata(); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()"); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()"); + minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic"); + minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc"); - Metadata macroMetadata = metadataList.get(1); - assertContains("Sub Embolden()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), - macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + assertContainsAtLeast(minExpected, metadataList); } } http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index ccfb293..5e0fc1e 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -1265,35 +1265,38 @@ public class OOXMLParserTest extends TikaTest { @Test public void testMacrosInDocm() throws Exception { - List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm"); - Metadata macroMetadata = metadataList.get(1); - assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), - macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + Metadata minExpected = new Metadata(); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()"); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()"); + minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic"); + 
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + + assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm")); } @Test public void testMacrosInPptm() throws Exception { - List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm"); - Metadata macroMetadata = metadataList.get(1); - assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), - macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + Metadata minExpected = new Metadata(); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()"); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()"); + minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic"); + minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + + assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm")); } @Test public void testMacroinXlsm() throws Exception { - List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xlsm"); - Metadata macroMetadata = metadataList.get(1); - assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); - assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), - macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + Metadata minExpected = new Metadata(); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()"); + 
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt"); + minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic"); + minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + + assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm")); } }
