This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 19c0e916982174da20ee98196db840c7465471eb
Author: tballison <[email protected]>
AuthorDate: Mon Mar 27 09:46:49 2017 -0400

    TIKA-2302 -- make extraction of macros optional in OfficeParsers and set default to false
---
 CHANGES.txt                                        |  3 +
 .../parser/microsoft/AbstractOfficeParser.java     | 12 ++++
 .../apache/tika/parser/microsoft/OfficeParser.java | 10 +++-
 .../tika/parser/microsoft/OfficeParserConfig.java  | 19 ++++++
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    | 18 ++++--
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  8 ++-
 .../tika/parser/microsoft/ExcelParserTest.java     | 22 ++++++-
 .../parser/microsoft/PowerPointParserTest.java     |  8 ++-
 .../tika/parser/microsoft/WordParserTest.java      | 26 +++++++-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    | 69 +++++++++++++++++++++-
 .../parser/microsoft/ooxml/SXSLFExtractorTest.java | 38 +++++++++++-
 .../parser/microsoft/ooxml/SXWPFExtractorTest.java | 36 ++++++++++-
 .../microsoft/ooxml/tika-config-dom-macros.xml     | 32 ++++++++++
 .../microsoft/ooxml/tika-config-sax-macros.xml     | 34 +++++++++++
 .../tika/parser/microsoft/tika-config-macros.xml   | 32 ++++++++++
 15 files changed, 347 insertions(+), 20 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 8a4c01e..1fe98a7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.15 - ??
 
+  * Change default behavior of Office Parsers to _not_ extract
+    macros.  Users must now call setExtractMacros(true) to extract them (TIKA-2302).
+
   * Unified logging across Tika: SLF4J as logging API, Apache Log4j as
     implementation with JCL and JUL bridges in standalone tools like
     tika-app, tika-batch and tika-server (TIKA-2245).
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index e01fe0c..48a756e 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -50,6 +50,13 @@ public abstract class AbstractOfficeParser extends 
AbstractParser {
         return defaultOfficeParserConfig.getUseSAXDocxExtractor();
     }
 
+    /**
+     * @see OfficeParserConfig#getExtractMacros()
+     * @return whether or not to extract macros
+     */
+    public boolean getExtractMacros() {
+        return defaultOfficeParserConfig.getExtractMacros();
+    }
 
     @Field
     public void setIncludeDeletedContent(boolean includeDeletedConent) {
@@ -70,4 +77,9 @@ public abstract class AbstractOfficeParser extends 
AbstractParser {
     public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) {
         defaultOfficeParserConfig.setUseSAXPptxExtractor(useSAXPptxExtractor);
     }
+
+    @Field
+    public void setExtractMacros(boolean extractMacros) {
+        defaultOfficeParserConfig.setExtractMacros(extractMacros);
+    }
 }
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 7e21ba8..4bd3804 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -40,6 +40,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.poifs.macros.VBAMacroReader;
 import org.apache.poi.util.IOUtils;
+import org.apache.tika.config.Initializable;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -129,10 +130,13 @@ public class OfficeParser extends AbstractOfficeParser {
                 }
             }
             parse(root, context, metadata, xhtml);
+            OfficeParserConfig officeParserConfig = 
context.get(OfficeParserConfig.class);
 
-            //now try to get macros
-            extractMacros(root.getNFileSystem(), xhtml,
-                    
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+            if (officeParserConfig.getExtractMacros()) {
+                //now try to get macros
+                extractMacros(root.getNFileSystem(), xhtml,
+                        
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+            }
         } finally {
             IOUtils.closeQuietly(mustCloseFs);
         }
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 05275d7..e1947a5 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -21,6 +21,8 @@ import java.io.Serializable;
 
 public class OfficeParserConfig implements Serializable {
 
+    private boolean extractMacros = false;
+
     private boolean includeDeletedContent = false;
     private boolean includeMoveFromContent = false;
 
@@ -28,6 +30,23 @@ public class OfficeParserConfig implements Serializable {
     private boolean useSAXPptxExtractor = false;
 
     /**
+     * Sets whether or not MSOffice parsers should extract macros.
+     * As of Tika 1.15, the default is <code>false</code>.
+     *
+     * @param extractMacros
+     */
+    public void setExtractMacros(boolean extractMacros) {
+        this.extractMacros = extractMacros;
+    }
+
+    /**
+     *
+     * @return whether or not to extract macros
+     */
+    public boolean getExtractMacros() {
+        return extractMacros;
+    }
+    /**
      * Sets whether or not the parser should include deleted content.
      * <p/>
      * <b>This has only been implemented in the streaming docx parser
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 426092e..26711b2 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -54,6 +54,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
@@ -91,9 +92,11 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
 
 
     private final EmbeddedDocumentExtractor embeddedExtractor;
+    private final ParseContext context;
     protected POIXMLTextExtractor extractor;
 
     public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor 
extractor) {
+        this.context = context;
         this.extractor = extractor;
         embeddedExtractor = 
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
     }
@@ -382,14 +385,17 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
 
 
     void handleMacros(PackagePart macroPart, ContentHandler handler) throws 
TikaException, SAXException {
+        OfficeParserConfig officeParserConfig = 
context.get(OfficeParserConfig.class);
 
-        try (InputStream is = macroPart.getInputStream()) {
-            try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
-                //Macro reading exceptions are already swallowed here
-                OfficeParser.extractMacros(npoifs, handler, embeddedExtractor);
+        if (officeParserConfig.getExtractMacros()) {
+            try (InputStream is = macroPart.getInputStream()) {
+                try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
+                    //Macro reading exceptions are already swallowed here
+                    OfficeParser.extractMacros(npoifs, handler, 
embeddedExtractor);
+                }
+            } catch (IOException e) {
+                throw new TikaException("Broken OOXML file", e);
             }
-        } catch (IOException e) {
-            throw new TikaException("Broken OOXML file", e);
         }
     }
 
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 3812bfa..86d74df 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -90,7 +90,9 @@ public class OOXMLExtractorFactory {
 
             // Have the appropriate OOXML text extractor picked
             POIXMLTextExtractor poiExtractor = null;
-            OfficeParserConfig config = context.get(OfficeParserConfig.class, 
new OfficeParserConfig());
+            //This has already been set by OOXMLParser's call to configure()
+            //We can rely on this being non-null.
+            OfficeParserConfig config = context.get(OfficeParserConfig.class);
             if (config.getUseSAXDocxExtractor()) {
                 poiExtractor = trySXWPF(pkg);
             }
@@ -109,11 +111,11 @@ public class OOXMLExtractorFactory {
             } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
                 extractor = new SXWPFWordExtractorDecorator(metadata, context,
                         (XWPFEventBasedWordExtractor) poiExtractor);
-                metadata.add("X-Parsed-By", 
XWPFEventBasedWordExtractor.class.getSimpleName());
+                metadata.add("X-Parsed-By", 
XWPFEventBasedWordExtractor.class.getCanonicalName());
             } else if (poiExtractor instanceof 
XSLFEventBasedPowerPointExtractor) {
                 extractor = new SXSLFPowerPointExtractorDecorator(metadata, 
context,
                         (XSLFEventBasedPowerPointExtractor) poiExtractor);
-                metadata.add("X-Parsed-By", 
XSLFEventBasedPowerPointExtractor.class.getSimpleName());
+                metadata.add("X-Parsed-By", 
XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
             } else if (document == null) {
                 throw new TikaException(
                         "Expecting UserModel based POI OOXML extractor with a 
document, but none found. " +
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index e6ba7cd..3efaa7c 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -27,6 +27,7 @@ import java.util.Locale;
 
 import org.apache.poi.util.LocaleUtil;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.EncryptedDocumentException;
@@ -480,6 +481,19 @@ public class ExcelParserTest extends TikaTest {
 
     @Test
     public void testMacros() throws  Exception {
+        //test default is "don't extract macros"
+        for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xls")) {
+            if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+                fail("Shouldn't have extract macros as default");
+            }
+        }
+
+        //now test that they were extracted
+        ParseContext context = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setExtractMacros(true);
+        context.set(OfficeParserConfig.class, officeParserConfig);
+
         Metadata minExpected = new Metadata();
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Dirty()");
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty 
dirt dirt");
@@ -487,7 +501,13 @@ public class ExcelParserTest extends TikaTest {
         minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
 
-        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testEXCEL_macro.xls"));
+        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testEXCEL_macro.xls", context));
+
+        //test configuring via config file
+        TikaConfig tikaConfig = new 
TikaConfig(this.getClass().getResourceAsStream("tika-config-macros.xml"));
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testEXCEL_macro.xls", parser));
+
     }
 
 }
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 1cadec4..57335b6 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -261,7 +261,13 @@ public class PowerPointParserTest extends TikaTest {
         minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
 
-        List<Metadata> metadataList = 
getRecursiveMetadata("testPPT_macros.ppt");
+        ParseContext parseContext = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setExtractMacros(true);
+        parseContext.set(OfficeParserConfig.class, officeParserConfig);
+
+
+        List<Metadata> metadataList = 
getRecursiveMetadata("testPPT_macros.ppt", parseContext);
         assertContainsAtLeast(minExpected, metadataList);
     }
 
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index abb15c7..1dd8bbf 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -29,11 +29,13 @@ import java.util.Locale;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.BodyContentHandler;
@@ -525,6 +527,21 @@ public class WordParserTest extends TikaTest {
 
     @Test
     public void testMacros() throws  Exception {
+
+        //test default is "don't extract macros"
+        for (Metadata metadata : getRecursiveMetadata("testWORD_macros.doc")) {
+            if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+                fail("Shouldn't have extract macros as default");
+            }
+        }
+
+        //now test that they were extracted
+        ParseContext context = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setExtractMacros(true);
+        context.set(OfficeParserConfig.class, officeParserConfig);
+
+
         Metadata minExpected = new Metadata();
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Embolden()");
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Italicize()");
@@ -532,7 +549,14 @@ public class WordParserTest extends TikaTest {
         minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
 
-        List<Metadata> metadataList = 
getRecursiveMetadata("testWORD_macros.doc");
+        List<Metadata> metadataList = 
getRecursiveMetadata("testWORD_macros.doc", context);
+        assertContainsAtLeast(minExpected, metadataList);
+
+        //test configuring via config file
+        TikaConfig tikaConfig = new 
TikaConfig(this.getClass().getResourceAsStream("tika-config-macros.xml"));
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+
+        metadataList = getRecursiveMetadata("testWORD_macros.doc", parser);
         assertContainsAtLeast(minExpected, metadataList);
     }
 
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 844880d..635b99c 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -20,6 +20,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.sax.SAXTransformerFactory;
@@ -1288,6 +1289,21 @@ public class OOXMLParserTest extends TikaTest {
 
     @Test
     public void testMacrosInDocm() throws Exception {
+
+        //test default is "don't extract macros"
+        for (Metadata metadata : getRecursiveMetadata("testWORD_macros.docm")) 
{
+            if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+                fail("Shouldn't have extract macros as default");
+            }
+        }
+
+        //now test that they were extracted
+        ParseContext context = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setExtractMacros(true);
+        context.set(OfficeParserConfig.class, officeParserConfig);
+
+
         Metadata minExpected = new Metadata();
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Embolden()");
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Italicize()");
@@ -1295,11 +1311,31 @@ public class OOXMLParserTest extends TikaTest {
         minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
 
-        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testWORD_macros.docm"));
+        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testWORD_macros.docm", context));
+
+        //test configuring via config file
+        TikaConfig tikaConfig = new 
TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml"));
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testWORD_macros.docm", parser));
+
     }
 
     @Test
     public void testMacrosInPptm() throws Exception {
+
+        //test default is "don't extract macros"
+        for (Metadata metadata : getRecursiveMetadata("testPPT_macros.pptm")) {
+            if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+                fail("Shouldn't have extract macros as default");
+            }
+        }
+
+        //now test that they were extracted
+        ParseContext context = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setExtractMacros(true);
+        context.set(OfficeParserConfig.class, officeParserConfig);
+
         Metadata minExpected = new Metadata();
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Embolden()");
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Italicize()");
@@ -1307,11 +1343,31 @@ public class OOXMLParserTest extends TikaTest {
         minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
 
-        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testPPT_macros.pptm"));
+        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testPPT_macros.pptm", context));
+
+        //test configuring via config file
+        TikaConfig tikaConfig = new 
TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml"));
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testPPT_macros.pptm", parser));
+
     }
 
     @Test
     public void testMacroinXlsm() throws Exception {
+
+        //test default is "don't extract macros"
+        for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xlsm")) 
{
+            if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+                fail("Shouldn't have extract macros as default");
+            }
+        }
+
+        //now test that they were extracted
+        ParseContext context = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setExtractMacros(true);
+        context.set(OfficeParserConfig.class, officeParserConfig);
+
         Metadata minExpected = new Metadata();
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Dirty()");
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty 
dirt dirt");
@@ -1319,7 +1375,14 @@ public class OOXMLParserTest extends TikaTest {
         minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
 
-        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testEXCEL_macro.xlsm"));
+        assertContainsAtLeast(minExpected,
+                getRecursiveMetadata("testEXCEL_macro.xlsm", context));
+
+        //test configuring via config file
+        TikaConfig tikaConfig = new 
TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml"));
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testEXCEL_macro.xlsm", parser));
+
     }
 
     //@Test //use this for lightweight benchmarking to compare xwpf options
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index 305b2e4..6d19c48 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 import java.io.InputStream;
 import java.util.HashMap;
@@ -27,6 +28,7 @@ import java.util.Locale;
 import java.util.Map;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -521,6 +523,29 @@ public class SXSLFExtractorTest extends TikaTest {
 
     @Test
     public void testMacrosInPptm() throws Exception {
+
+        Metadata parsedBy = new Metadata();
+        parsedBy.add("X-Parsed-By",
+                
"org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor");
+
+        List<Metadata> metadataList = 
getRecursiveMetadata("testPPT_macros.pptm", parseContext);
+
+        //test default is "don't extract macros"
+        for (Metadata metadata : metadataList) {
+            if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+                fail("Shouldn't have extract macros as default");
+            }
+        }
+
+        assertContainsAtLeast(parsedBy, metadataList);
+
+        //now test that they are extracted
+        ParseContext context = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setExtractMacros(true);
+        officeParserConfig.setUseSAXPptxExtractor(true);
+        context.set(OfficeParserConfig.class, officeParserConfig);
+
         Metadata minExpected = new Metadata();
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Embolden()");
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Italicize()");
@@ -528,6 +553,17 @@ public class SXSLFExtractorTest extends TikaTest {
         minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
 
-        assertContainsAtLeast(minExpected, 
getRecursiveMetadata("testPPT_macros.pptm", parseContext));
+        metadataList = getRecursiveMetadata("testPPT_macros.pptm", context);
+
+        assertContainsAtLeast(minExpected, metadataList);
+        assertContainsAtLeast(parsedBy, metadataList);
+
+        //test configuring via config file
+        TikaConfig tikaConfig = new 
TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml"));
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        metadataList = getRecursiveMetadata("testPPT_macros.pptm", parser);
+        assertContainsAtLeast(minExpected, metadataList);
+        assertContainsAtLeast(parsedBy, metadataList);
+
     }
 }
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 635d0c9..883681e 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -20,6 +20,7 @@ package org.apache.tika.parser.microsoft.ooxml;
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
@@ -31,6 +32,7 @@ import java.util.Locale;
 import java.util.Map;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -701,9 +703,32 @@ public class SXWPFExtractorTest extends TikaTest {
 
     @Test
     public void testMacrosInDocm() throws Exception {
+
+        Metadata parsedBy = new Metadata();
+        parsedBy.add("X-Parsed-By",
+                
"org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor");
+
+        //test default is "don't extract macros"
         List<Metadata> metadataList = 
getRecursiveMetadata("testWORD_macros.docm", parseContext);
+        for (Metadata metadata : metadataList) {
+            if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+                fail("Shouldn't have extract macros as default");
+            }
+        }
+        assertContainsAtLeast(parsedBy, metadataList);
+
+        //now test that they were extracted
+        ParseContext context = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setExtractMacros(true);
+        officeParserConfig.setUseSAXDocxExtractor(true);
+        context.set(OfficeParserConfig.class, officeParserConfig);
+
+        metadataList = getRecursiveMetadata("testWORD_macros.docm", context);
         //check that content came out of the .docm file
         assertContains("quick", 
metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContainsAtLeast(parsedBy, metadataList);
+
 
         Metadata minExpected = new Metadata();
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub 
Embolden()");
@@ -712,7 +737,16 @@ public class SXWPFExtractorTest extends TikaTest {
         minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
 
-        assertContainsAtLeast(minExpected, metadataList);//, parseContext));
+        assertContainsAtLeast(minExpected, metadataList);
+        assertContainsAtLeast(parsedBy, metadataList);
+
+        //test configuring via config file
+        TikaConfig tikaConfig = new 
TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml"));
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        metadataList = getRecursiveMetadata("testWORD_macros.docm", parser);
+        assertContainsAtLeast(minExpected, metadataList);
+        assertContainsAtLeast(parsedBy, metadataList);
+
     }
 
     @Test
diff --git 
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml
 
b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml
new file mode 100644
index 0000000..a0e822f
--- /dev/null
+++ 
b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="extractMacros" type="bool">true</param>
+            </params>
+        </parser>
+        <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+            <params>
+                <param name="extractMacros" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git 
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml
 
b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml
new file mode 100644
index 0000000..83d890c
--- /dev/null
+++ 
b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="extractMacros" type="bool">true</param>
+                <param name="useSAXDocxExtractor" type="bool">true</param>
+                <param name="useSAXPptxExtractor" type="bool">true</param>
+            </params>
+        </parser>
+        <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+            <params>
+                <param name="extractMacros" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git 
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml
 
b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml
new file mode 100644
index 0000000..a0e822f
--- /dev/null
+++ 
b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="extractMacros" type="bool">true</param>
+            </params>
+        </parser>
+        <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+            <params>
+                <param name="extractMacros" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>

-- 
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.

Reply via email to