Repository: tika
Updated Branches:
  refs/heads/2.x 66f433471 -> d543378a8


TIKA-2069 -- extract macros from MSOffice docs, fix tests to find target 
metadata object in any order


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d543378a
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d543378a
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d543378a

Branch: refs/heads/2.x
Commit: d543378a88aeca574d15ab31d13b6316fb938f7f
Parents: 66f4334
Author: tballison <talli...@mitre.org>
Authored: Thu Sep 22 09:04:40 2016 -0400
Committer: tballison <talli...@mitre.org>
Committed: Thu Sep 22 09:04:40 2016 -0400

----------------------------------------------------------------------
 .../src/test/java/org/apache/tika/TikaTest.java | 38 +++++++++++++++++
 .../tika/parser/microsoft/ExcelParserTest.java  | 18 ++++----
 .../parser/microsoft/PowerPointParserTest.java  | 15 ++++---
 .../tika/parser/microsoft/WordParserTest.java   | 15 ++++---
 .../parser/microsoft/ooxml/OOXMLParserTest.java | 45 +++++++++++---------
 5 files changed, 88 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d543378a/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index f691a3a..847eb79 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -122,6 +122,44 @@ public abstract class TikaTest {
         assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
     }
 
+    /**
+     * Test that in at least one item in metadataList, all keys and values
+     * in minExpected are contained.
+     * <p>
+     * The values in minExpected are tested for whether they are contained
+     * within a value in the target.  If minExpected=&quot;text/vbasic&quot; and
+     * what was actually found in the target within metadatalist is
+     * &quot;text/vbasic; charset=windows-1252&quot;,
+     * that is counted as a hit.
+     *
+     * @param minExpected
+     * @param metadataList
+     */
+    public static void assertContainsAtLeast(Metadata minExpected, List<Metadata> metadataList) {
+
+        for (Metadata m : metadataList) {
+            int foundPropertyCount = 0;
+            for (String n : minExpected.names()) {
+                int foundValCount = 0;
+                for (String foundVal : m.getValues(n)) {
+                    for (String expectedVal : minExpected.getValues(n)) {
+                        if (foundVal.contains(expectedVal)) {
+                            foundValCount++;
+                        }
+                    }
+                }
+                if (foundValCount == minExpected.getValues(n).length) {
+                    foundPropertyCount++;
+                }
+            }
+            if (foundPropertyCount == minExpected.names().length) {
+                //found everything!
+                return;
+            }
+        }
+        //TODO: figure out how to have more informative error message
+        fail("Couldn't find everything within a single metadata item");
+    }
     protected static class XMLResult {
         public final String xml;
         public final Metadata metadata;

http://git-wip-us.apache.org/repos/asf/tika/blob/d543378a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index b136a5d..94e23c1 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -421,14 +421,14 @@ public class ExcelParserTest extends TikaTest {
         assertContains("1.23456789012345E15", xml);//16 digit formula, ditto
     }
 
-    @Test
-    public void testMacroinXls() throws Exception {
-        List<Metadata> metadataList = 
getRecursiveMetadata("testEXCEL_macro.xls");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Dirty()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("dirty dirt dirt", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    public void testMacros() throws  Exception {
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+        assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls"));
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/d543378a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 7e68ce8..4eabff2 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -254,12 +254,15 @@ public class PowerPointParserTest extends TikaTest {
     @Test
     @Ignore("POI 3.15-final not finding any macros in this ppt")
     public void testMacros() throws  Exception {
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Embolden()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Italicize()");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
         List<Metadata> metadataList = 
getRecursiveMetadata("testPPT_macros.ppt");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Embolden()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("Sub Italicize()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        assertContainsAtLeast(minExpected, metadataList);
     }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/d543378a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index e63a61b..bfb7ca1 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -524,14 +524,15 @@ public class WordParserTest extends TikaTest {
 
     @Test
     public void testMacros() throws  Exception {
-        //debug(getRecursiveMetadata("SimpleMacro.doc"));
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Embolden()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Italicize()");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
         List<Metadata> metadataList = 
getRecursiveMetadata("testWORD_macros.doc");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Embolden()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("Sub Italicize()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        assertContainsAtLeast(minExpected, metadataList);
     }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/d543378a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 3c67397..d924f41 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1265,35 +1265,38 @@ public class OOXMLParserTest extends TikaTest {
 
     @Test
     public void testMacrosInDocm() throws Exception {
-        List<Metadata> metadataList = 
getRecursiveMetadata("testWORD_macros.docm");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Embolden()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("Sub Italicize()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Embolden()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Italicize()");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testWORD_macros.docm"));
     }
 
     @Test
     public void testMacrosInPptm() throws Exception {
-        List<Metadata> metadataList = 
getRecursiveMetadata("testPPT_macros.pptm");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Embolden()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("Sub Italicize()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Embolden()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Italicize()");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testPPT_macros.pptm"));
     }
 
     @Test
     public void testMacroinXlsm() throws Exception {
-        List<Metadata> metadataList = 
getRecursiveMetadata("testEXCEL_macro.xlsm");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Dirty()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("dirty dirt dirt", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Dirty()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty 
dirt dirt");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testEXCEL_macro.xlsm"));
     }
 
 }

Reply via email to