This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 19c0e916982174da20ee98196db840c7465471eb Author: tballison <[email protected]> AuthorDate: Mon Mar 27 09:46:49 2017 -0400 TIKA-2302 -- make extraction of macros optional in OfficeParsers and set default to false --- CHANGES.txt | 3 + .../parser/microsoft/AbstractOfficeParser.java | 12 ++++ .../apache/tika/parser/microsoft/OfficeParser.java | 10 +++- .../tika/parser/microsoft/OfficeParserConfig.java | 19 ++++++ .../microsoft/ooxml/AbstractOOXMLExtractor.java | 18 ++++-- .../microsoft/ooxml/OOXMLExtractorFactory.java | 8 ++- .../tika/parser/microsoft/ExcelParserTest.java | 22 ++++++- .../parser/microsoft/PowerPointParserTest.java | 8 ++- .../tika/parser/microsoft/WordParserTest.java | 26 +++++++- .../parser/microsoft/ooxml/OOXMLParserTest.java | 69 +++++++++++++++++++++- .../parser/microsoft/ooxml/SXSLFExtractorTest.java | 38 +++++++++++- .../parser/microsoft/ooxml/SXWPFExtractorTest.java | 36 ++++++++++- .../microsoft/ooxml/tika-config-dom-macros.xml | 32 ++++++++++ .../microsoft/ooxml/tika-config-sax-macros.xml | 34 +++++++++++ .../tika/parser/microsoft/tika-config-macros.xml | 32 ++++++++++ 15 files changed, 347 insertions(+), 20 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 8a4c01e..1fe98a7 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,8 @@ Release 1.15 - ?? + * Change default behavior of Office Parsers to _not_ extract + Macros. User needs to setExtractMacros to "true" (TIKA-2302). + * Unified logging across Tika: SLF4J as logging API, Apache Log4j as implementation with JCL and JUL bridges in standalone tools like tika-app, tika-batch and tika-server (TIKA-2245). diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java index e01fe0c..48a756e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java @@ -50,6 +50,13 @@ public abstract class AbstractOfficeParser extends AbstractParser { return defaultOfficeParserConfig.getUseSAXDocxExtractor(); } + /** + * @see OfficeParserConfig#getExtractMacros() + * @return whether or not to extract macros + */ + public boolean getExtractMacros() { + return defaultOfficeParserConfig.getExtractMacros(); + } @Field public void setIncludeDeletedContent(boolean includeDeletedConent) { @@ -70,4 +77,9 @@ public abstract class AbstractOfficeParser extends AbstractParser { public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) { defaultOfficeParserConfig.setUseSAXPptxExtractor(useSAXPptxExtractor); } + + @Field + public void setExtractMacros(boolean extractMacros) { + defaultOfficeParserConfig.setExtractMacros(extractMacros); + } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index 7e21ba8..4bd3804 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -40,6 +40,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.macros.VBAMacroReader; import org.apache.poi.util.IOUtils; +import org.apache.tika.config.Initializable; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -129,10 +130,13 @@ public class OfficeParser extends AbstractOfficeParser { } } parse(root, context, metadata, xhtml); + OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class); - //now try to get macros - extractMacros(root.getNFileSystem(), xhtml, - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); + if (officeParserConfig.getExtractMacros()) { + //now try to get macros + extractMacros(root.getNFileSystem(), xhtml, + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); + } } finally { IOUtils.closeQuietly(mustCloseFs); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java index 05275d7..e1947a5 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java @@ -21,6 +21,8 @@ import java.io.Serializable; public class OfficeParserConfig implements Serializable { + private boolean extractMacros = false; + private boolean includeDeletedContent = false; private boolean includeMoveFromContent = false; @@ -28,6 +30,23 @@ public class OfficeParserConfig implements Serializable { private boolean useSAXPptxExtractor = false; /** + * Sets whether or not MSOffice parsers should extract macros. + * As of Tika 1.15, the default is <code>false</code>. + * + * @param extractMacros + */ + public void setExtractMacros(boolean extractMacros) { + this.extractMacros = extractMacros; + } + + /** + * + * @return whether or not to extract macros + */ + public boolean getExtractMacros() { + return extractMacros; + } + /** * Sets whether or not the parser should include deleted content. * <p/> * <b>This has only been implemented in the streaming docx parser diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 426092e..26711b2 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -54,6 +54,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType; +import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.xmlbeans.XmlException; @@ -91,9 +92,11 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { private final EmbeddedDocumentExtractor embeddedExtractor; + private final ParseContext context; protected POIXMLTextExtractor extractor; public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) { + this.context = context; this.extractor = extractor; embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); } @@ -382,14 +385,17 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { void handleMacros(PackagePart macroPart, ContentHandler handler) throws TikaException, SAXException { + OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class); - try (InputStream is = macroPart.getInputStream()) { - try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) { - //Macro reading exceptions are already swallowed here - OfficeParser.extractMacros(npoifs, handler, embeddedExtractor); + if (officeParserConfig.getExtractMacros()) { + try (InputStream is = macroPart.getInputStream()) { + try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) { + //Macro reading exceptions are already swallowed here + OfficeParser.extractMacros(npoifs, handler, embeddedExtractor); + } + } catch (IOException e) { + throw new TikaException("Broken OOXML file", e); } - } catch (IOException e) { - throw new TikaException("Broken OOXML file", e); } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 3812bfa..86d74df 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -90,7 +90,9 @@ public class OOXMLExtractorFactory { // Have the appropriate OOXML text extractor picked POIXMLTextExtractor poiExtractor = null; - OfficeParserConfig config = context.get(OfficeParserConfig.class, new OfficeParserConfig()); + //This has already been set by OOXMLParser's call to configure() + //We can rely on this being non-null. + OfficeParserConfig config = context.get(OfficeParserConfig.class); if (config.getUseSAXDocxExtractor()) { poiExtractor = trySXWPF(pkg); } @@ -109,11 +111,11 @@ public class OOXMLExtractorFactory { } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) { extractor = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) poiExtractor); - metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getSimpleName()); + metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName()); } else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) { extractor = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) poiExtractor); - metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getSimpleName()); + metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName()); } else if (document == null) { throw new TikaException( "Expecting UserModel based POI OOXML extractor with a document, but none found. " + diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java index e6ba7cd..3efaa7c 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java @@ -27,6 +27,7 @@ import java.util.Locale; import org.apache.poi.util.LocaleUtil; import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; import org.apache.tika.exception.EncryptedDocumentException; @@ -480,6 +481,19 @@ public class ExcelParserTest extends TikaTest { @Test public void testMacros() throws Exception { + //test default is "don't extract macros" + for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xls")) { + if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) { + fail("Shouldn't have extract macros as default"); + } + } + + //now test that they were extracted + ParseContext context = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + context.set(OfficeParserConfig.class, officeParserConfig); + Metadata minExpected = new Metadata(); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()"); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt"); @@ -487,7 +501,13 @@ public class ExcelParserTest extends TikaTest { minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); - assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls")); + assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls", context)); + + //test configuring via config file + TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-macros.xml")); + AutoDetectParser parser = new AutoDetectParser(tikaConfig); + assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls", parser)); + } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java index 1cadec4..57335b6 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java @@ -261,7 +261,13 @@ public class PowerPointParserTest extends TikaTest { minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); - List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.ppt"); + ParseContext parseContext = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + parseContext.set(OfficeParserConfig.class, officeParserConfig); + + + List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.ppt", parseContext); assertContainsAtLeast(minExpected, metadataList); } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java index abb15c7..1dd8bbf 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java @@ -29,11 +29,13 @@ import java.util.Locale; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.BodyContentHandler; @@ -525,6 +527,21 @@ public class WordParserTest extends TikaTest { @Test public void testMacros() throws Exception { + + //test default is "don't extract macros" + for (Metadata metadata : getRecursiveMetadata("testWORD_macros.doc")) { + if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) { + fail("Shouldn't have extract macros as default"); + } + } + + //now test that they were extracted + ParseContext context = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + context.set(OfficeParserConfig.class, officeParserConfig); + + Metadata minExpected = new Metadata(); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()"); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()"); @@ -532,7 +549,14 @@ public class WordParserTest extends TikaTest { minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); - List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc"); + List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc", context); + assertContainsAtLeast(minExpected, metadataList); + + //test configuring via config file + TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-macros.xml")); + AutoDetectParser parser = new AutoDetectParser(tikaConfig); + + metadataList = getRecursiveMetadata("testWORD_macros.doc", parser); assertContainsAtLeast(minExpected, metadataList); } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 844880d..635b99c 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -20,6 +20,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import javax.xml.transform.OutputKeys; import javax.xml.transform.sax.SAXTransformerFactory; @@ -1288,6 +1289,21 @@ public class OOXMLParserTest extends TikaTest { @Test public void testMacrosInDocm() throws Exception { + + //test default is "don't extract macros" + for (Metadata metadata : getRecursiveMetadata("testWORD_macros.docm")) { + if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) { + fail("Shouldn't have extract macros as default"); + } + } + + //now test that they were extracted + ParseContext context = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + context.set(OfficeParserConfig.class, officeParserConfig); + + Metadata minExpected = new Metadata(); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()"); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()"); @@ -1295,11 +1311,31 @@ public class OOXMLParserTest extends TikaTest { minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); - assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm")); + assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", context)); + + //test configuring via config file + TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml")); + AutoDetectParser parser = new AutoDetectParser(tikaConfig); + assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parser)); + } @Test public void testMacrosInPptm() throws Exception { + + //test default is "don't extract macros" + for (Metadata metadata : getRecursiveMetadata("testPPT_macros.pptm")) { + if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) { + fail("Shouldn't have extract macros as default"); + } + } + + //now test that they were extracted + ParseContext context = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + context.set(OfficeParserConfig.class, officeParserConfig); + Metadata minExpected = new Metadata(); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()"); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()"); @@ -1307,11 +1343,31 @@ public class OOXMLParserTest extends TikaTest { minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); - assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm")); + assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", context)); + + //test configuring via config file + TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml")); + AutoDetectParser parser = new AutoDetectParser(tikaConfig); + assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", parser)); + } @Test public void testMacroinXlsm() throws Exception { + + //test default is "don't extract macros" + for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xlsm")) { + if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) { + fail("Shouldn't have extract macros as default"); + } + } + + //now test that they were extracted + ParseContext context = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + context.set(OfficeParserConfig.class, officeParserConfig); + Metadata minExpected = new Metadata(); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()"); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt"); @@ -1319,7 +1375,14 @@ public class OOXMLParserTest extends TikaTest { minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); - assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm")); + assertContainsAtLeast(minExpected, + getRecursiveMetadata("testEXCEL_macro.xlsm", context)); + + //test configuring via config file + TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml")); + AutoDetectParser parser = new AutoDetectParser(tikaConfig); + assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm", parser)); + } //@Test //use this for lightweight benchmarking to compare xwpf options diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java index 305b2e4..6d19c48 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java @@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import java.io.InputStream; import java.util.HashMap; @@ -27,6 +28,7 @@ import java.util.Locale; import java.util.Map; import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; @@ -521,6 +523,29 @@ public class SXSLFExtractorTest extends TikaTest { @Test public void testMacrosInPptm() throws Exception { + + Metadata parsedBy = new Metadata(); + parsedBy.add("X-Parsed-By", + "org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor"); + + List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm", parseContext); + + //test default is "don't extract macros" + for (Metadata metadata : metadataList) { + if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) { + fail("Shouldn't have extract macros as default"); + } + } + + assertContainsAtLeast(parsedBy, metadataList); + + //now test that they are extracted + ParseContext context = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + officeParserConfig.setUseSAXPptxExtractor(true); + context.set(OfficeParserConfig.class, officeParserConfig); + Metadata minExpected = new Metadata(); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()"); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()"); @@ -528,6 +553,17 @@ public class SXSLFExtractorTest extends TikaTest { minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); - assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", parseContext)); + metadataList = getRecursiveMetadata("testPPT_macros.pptm", context); + + assertContainsAtLeast(minExpected, metadataList); + assertContainsAtLeast(parsedBy, metadataList); + + //test configuring via config file + TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml")); + AutoDetectParser parser = new AutoDetectParser(tikaConfig); + metadataList = getRecursiveMetadata("testPPT_macros.pptm", parser); + assertContainsAtLeast(minExpected, metadataList); + assertContainsAtLeast(parsedBy, metadataList); + } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java index 635d0c9..883681e 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java @@ -20,6 +20,7 @@ package org.apache.tika.parser.microsoft.ooxml; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import java.io.ByteArrayOutputStream; import java.io.InputStream; @@ -31,6 +32,7 @@ import java.util.Locale; import java.util.Map; import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; @@ -701,9 +703,32 @@ public class SXWPFExtractorTest extends TikaTest { @Test public void testMacrosInDocm() throws Exception { + + Metadata parsedBy = new Metadata(); + parsedBy.add("X-Parsed-By", + "org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor"); + + //test default is "don't extract macros" List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext); + for (Metadata metadata : metadataList) { + if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) { + fail("Shouldn't have extract macros as default"); + } + } + assertContainsAtLeast(parsedBy, metadataList); + + //now test that they were extracted + ParseContext context = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + officeParserConfig.setUseSAXDocxExtractor(true); + context.set(OfficeParserConfig.class, officeParserConfig); + + metadataList = getRecursiveMetadata("testWORD_macros.docm", context); //check that content came out of the .docm file assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContainsAtLeast(parsedBy, metadataList); + Metadata minExpected = new Metadata(); minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()"); @@ -712,7 +737,16 @@ public class SXWPFExtractorTest extends TikaTest { minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); - assertContainsAtLeast(minExpected, metadataList);//, parseContext)); + assertContainsAtLeast(minExpected, metadataList); + assertContainsAtLeast(parsedBy, metadataList); + + //test configuring via config file + TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml")); + AutoDetectParser parser = new AutoDetectParser(tikaConfig); + metadataList = getRecursiveMetadata("testWORD_macros.docm", parser); + assertContainsAtLeast(minExpected, metadataList); + assertContainsAtLeast(parsedBy, metadataList); + } @Test diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml new file mode 100644 index 0000000..a0e822f --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> + <params> + <param name="extractMacros" type="bool">true</param> + </params> + </parser> + <parser class="org.apache.tika.parser.microsoft.OfficeParser"> + <params> + <param name="extractMacros" type="bool">true</param> + </params> + </parser> + </parsers> +</properties> diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml new file mode 100644 index 0000000..83d890c --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml @@ -0,0 +1,34 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> + <params> + <param name="extractMacros" type="bool">true</param> + <param name="useSAXDocxExtractor" type="bool">true</param> + <param name="useSAXPptxExtractor" type="bool">true</param> + </params> + </parser> + <parser class="org.apache.tika.parser.microsoft.OfficeParser"> + <params> + <param name="extractMacros" type="bool">true</param> + </params> + </parser> + </parsers> +</properties> diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml new file mode 100644 index 0000000..a0e822f --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> + <params> + <param name="extractMacros" type="bool">true</param> + </params> + </parser> + <parser class="org.apache.tika.parser.microsoft.OfficeParser"> + <params> + <param name="extractMacros" type="bool">true</param> + </params> + </parser> + </parsers> +</properties> -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
