This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/2.x by this push:
new 1826112 TIKA-2302 -- make macro extraction configurable and set
default to false
new 3e92516 Merge remote-tracking branch 'origin/2.x' into 2.x
1826112 is described below
commit 1826112e6c3bfd4001cef896279263ccbe0a1923
Author: tballison <[email protected]>
AuthorDate: Mon Mar 27 10:17:49 2017 -0400
TIKA-2302 -- make macro extraction configurable and set default to false
---
CHANGES.txt | 3 +
.../parser/microsoft/AbstractOfficeParser.java | 12 ++++
.../apache/tika/parser/microsoft/OfficeParser.java | 9 ++-
.../tika/parser/microsoft/OfficeParserConfig.java | 19 ++++++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 18 ++++--
.../microsoft/ooxml/OOXMLExtractorFactory.java | 8 ++-
.../tika/parser/microsoft/ExcelParserTest.java | 23 +++++++-
.../parser/microsoft/PowerPointParserTest.java | 8 ++-
.../tika/parser/microsoft/WordParserTest.java | 24 +++++++-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 69 +++++++++++++++++++++-
.../parser/microsoft/ooxml/SXSLFExtractorTest.java | 36 ++++++++++-
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 35 ++++++++++-
.../microsoft/ooxml/tika-config-dom-macros.xml | 32 ++++++++++
.../microsoft/ooxml/tika-config-sax-macros.xml | 34 +++++++++++
.../tika/parser/microsoft/tika-config-macros.xml | 32 ++++++++++
.../tika/parser/microsoft/tika-config-sax-docx.xml | 27 +++++++++
16 files changed, 368 insertions(+), 21 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 857a2a4..a7045ea 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
Release 1.15 -???
+ * Change default behavior of Office Parsers to _not_ extract
+ Macros. Users need to set extractMacros to "true" to extract them (TIKA-2302).
+
* Upgrade PDFBox to 2.0.5 and JempBox to 1.8.13 (TIKA-2236).
* Extract images and thumbnails from ODT via Sam Bayer (TIKA-2295).
diff --git
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index 2538219..a47a411 100644
---
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -49,6 +49,18 @@ public abstract class AbstractOfficeParser extends
AbstractParser {
return defaultOfficeParserConfig.getUseSAXDocxExtractor();
}
+ /**
+ * @see OfficeParserConfig#getExtractMacros()
+ * @return whether or not to extract macros
+ */
+ public boolean getExtractMacros() {
+ return defaultOfficeParserConfig.getExtractMacros();
+ }
+
+ //@Field
+ public void setExtractMacros(boolean extractMacros) {
+ defaultOfficeParserConfig.setExtractMacros(extractMacros);
+ }
// @Field
public void setIncludeDeletedContent(boolean includeDeletedConent) {
diff --git
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 7e21ba8..31edcf5 100644
---
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -129,10 +129,13 @@ public class OfficeParser extends AbstractOfficeParser {
}
}
parse(root, context, metadata, xhtml);
+ OfficeParserConfig officeParserConfig =
context.get(OfficeParserConfig.class);
- //now try to get macros
- extractMacros(root.getNFileSystem(), xhtml,
-
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+ if (officeParserConfig.getExtractMacros()) {
+ //now try to get macros
+ extractMacros(root.getNFileSystem(), xhtml,
+
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+ }
} finally {
IOUtils.closeQuietly(mustCloseFs);
}
diff --git
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 05275d7..e1947a5 100644
---
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -21,6 +21,8 @@ import java.io.Serializable;
public class OfficeParserConfig implements Serializable {
+ private boolean extractMacros = false;
+
private boolean includeDeletedContent = false;
private boolean includeMoveFromContent = false;
@@ -28,6 +30,23 @@ public class OfficeParserConfig implements Serializable {
private boolean useSAXPptxExtractor = false;
/**
+ * Sets whether or not MSOffice parsers should extract macros.
+ * As of Tika 1.15, the default is <code>false</code>.
+ *
+ * @param extractMacros <code>true</code> to extract macros, <code>false</code> to skip them
+ */
+ public void setExtractMacros(boolean extractMacros) {
+ this.extractMacros = extractMacros;
+ }
+
+ /**
+ * Returns the macro-extraction setting stored by {@link #setExtractMacros(boolean)}.
+ * @return whether or not to extract macros
+ */
+ public boolean getExtractMacros() {
+ return extractMacros;
+ }
+ /**
* Sets whether or not the parser should include deleted content.
* <p/>
* <b>This has only been implemented in the streaming docx parser
diff --git
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 426092e..26711b2 100644
---
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -54,6 +54,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
@@ -91,9 +92,11 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
private final EmbeddedDocumentExtractor embeddedExtractor;
+ private final ParseContext context;
protected POIXMLTextExtractor extractor;
public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor
extractor) {
+ this.context = context;
this.extractor = extractor;
embeddedExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
}
@@ -382,14 +385,17 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
void handleMacros(PackagePart macroPart, ContentHandler handler) throws
TikaException, SAXException {
+ OfficeParserConfig officeParserConfig =
context.get(OfficeParserConfig.class);
- try (InputStream is = macroPart.getInputStream()) {
- try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
- //Macro reading exceptions are already swallowed here
- OfficeParser.extractMacros(npoifs, handler, embeddedExtractor);
+ if (officeParserConfig.getExtractMacros()) {
+ try (InputStream is = macroPart.getInputStream()) {
+ try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
+ //Macro reading exceptions are already swallowed here
+ OfficeParser.extractMacros(npoifs, handler,
embeddedExtractor);
+ }
+ } catch (IOException e) {
+ throw new TikaException("Broken OOXML file", e);
}
- } catch (IOException e) {
- throw new TikaException("Broken OOXML file", e);
}
}
diff --git
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index c2386c6..3443cf5 100644
---
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -90,7 +90,9 @@ public class OOXMLExtractorFactory {
// Have the appropriate OOXML text extractor picked
POIXMLTextExtractor poiExtractor = null;
- OfficeParserConfig config = context.get(OfficeParserConfig.class,
new OfficeParserConfig());
+ //This has already been set by OOXMLParser's call to configure()
+ //We can rely on this being non-null.
+ OfficeParserConfig config = context.get(OfficeParserConfig.class);
if (config.getUseSAXDocxExtractor()) {
poiExtractor = trySXWPF(pkg);
}
@@ -108,11 +110,11 @@ public class OOXMLExtractorFactory {
} else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
extractor = new SXWPFWordExtractorDecorator(metadata, context,
(XWPFEventBasedWordExtractor) poiExtractor);
- metadata.add("X-Parsed-By",
XWPFEventBasedWordExtractor.class.getSimpleName());
+ metadata.add("X-Parsed-By",
XWPFEventBasedWordExtractor.class.getCanonicalName());
} else if (poiExtractor instanceof
XSLFEventBasedPowerPointExtractor) {
extractor = new SXSLFPowerPointExtractorDecorator(metadata,
context,
(XSLFEventBasedPowerPointExtractor) poiExtractor);
- metadata.add("X-Parsed-By",
XSLFEventBasedPowerPointExtractor.class.getSimpleName());
+ metadata.add("X-Parsed-By",
XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
} else if (document == null) {
throw new TikaException(
"Expecting UserModel based POI OOXML extractor with a
document, but none found. " +
diff --git
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 03900e5..265453f 100644
---
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -425,14 +425,33 @@ public class ExcelParserTest extends TikaTest {
"<td>1"+symbols.getDecimalSeparator()+"23456789012345E15",
xml);
}
+ @Test
public void testMacros() throws Exception {
+ //test default is "don't extract macros"
+ for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xls")) {
+ if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+ fail("Shouldn't have extracted macros as default");
+ }
+ }
+
+ //now test that they were extracted
+ ParseContext context = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setExtractMacros(true);
+ context.set(OfficeParserConfig.class, officeParserConfig);
+
Metadata minExpected = new Metadata();
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Dirty()");
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty
dirt dirt");
- minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+ assertContainsAtLeast(minExpected,
getRecursiveMetadata("testEXCEL_macro.xls", context));
+
+ //test configuring via config file
+ /*TikaConfig tikaConfig = new
TikaConfig(this.getClass().getResourceAsStream("tika-config-macros.xml"));
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ assertContainsAtLeast(minExpected,
getRecursiveMetadata("testEXCEL_macro.xls", parser));*/
- assertContainsAtLeast(minExpected,
getRecursiveMetadata("testEXCEL_macro.xls"));
}
}
diff --git
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 152b7be..31b5ae9 100644
---
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -261,7 +261,13 @@ public class PowerPointParserTest extends TikaTest {
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
- List<Metadata> metadataList =
getRecursiveMetadata("testPPT_macros.ppt");
+ ParseContext parseContext = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setExtractMacros(true);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+
+
+ List<Metadata> metadataList =
getRecursiveMetadata("testPPT_macros.ppt", parseContext);
assertContainsAtLeast(minExpected, metadataList);
}
diff --git
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index abb15c7..c74891f 100644
---
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -525,6 +525,21 @@ public class WordParserTest extends TikaTest {
@Test
public void testMacros() throws Exception {
+
+ //test default is "don't extract macros"
+ for (Metadata metadata : getRecursiveMetadata("testWORD_macros.doc")) {
+ if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+ fail("Shouldn't have extracted macros as default");
+ }
+ }
+
+ //now test that they were extracted
+ ParseContext context = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setExtractMacros(true);
+ context.set(OfficeParserConfig.class, officeParserConfig);
+
+
Metadata minExpected = new Metadata();
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Embolden()");
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Italicize()");
@@ -532,8 +547,15 @@ public class WordParserTest extends TikaTest {
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
- List<Metadata> metadataList =
getRecursiveMetadata("testWORD_macros.doc");
+ List<Metadata> metadataList =
getRecursiveMetadata("testWORD_macros.doc", context);
assertContainsAtLeast(minExpected, metadataList);
+/*
+ //test configuring via config file
+ TikaConfig tikaConfig = new
TikaConfig(this.getClass().getResourceAsStream("tika-config-macros.xml"));
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+
+ metadataList = getRecursiveMetadata("testWORD_macros.doc", parser);
+ assertContainsAtLeast(minExpected, metadataList);*/
}
@Test
diff --git
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index fa92a7e..f555617 100644
---
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -20,6 +20,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
@@ -1272,6 +1273,21 @@ public class OOXMLParserTest extends TikaTest {
@Test
public void testMacrosInDocm() throws Exception {
+
+ //test default is "don't extract macros"
+ for (Metadata metadata : getRecursiveMetadata("testWORD_macros.docm"))
{
+ if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+ fail("Shouldn't have extract macros as default");
+ }
+ }
+
+ //now test that they were extracted
+ ParseContext context = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setExtractMacros(true);
+ context.set(OfficeParserConfig.class, officeParserConfig);
+
+
Metadata minExpected = new Metadata();
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Embolden()");
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Italicize()");
@@ -1279,11 +1295,31 @@ public class OOXMLParserTest extends TikaTest {
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
- assertContainsAtLeast(minExpected,
getRecursiveMetadata("testWORD_macros.docm"));
+ assertContainsAtLeast(minExpected,
getRecursiveMetadata("testWORD_macros.docm", context));
+/*
+ //test configuring via config file
+ TikaConfig tikaConfig = new
TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml"));
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ assertContainsAtLeast(minExpected,
getRecursiveMetadata("testWORD_macros.docm", parser));
+*/
}
@Test
public void testMacrosInPptm() throws Exception {
+
+ //test default is "don't extract macros"
+ for (Metadata metadata : getRecursiveMetadata("testPPT_macros.pptm")) {
+ if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+ fail("Shouldn't have extract macros as default");
+ }
+ }
+
+ //now test that they were extracted
+ ParseContext context = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setExtractMacros(true);
+ context.set(OfficeParserConfig.class, officeParserConfig);
+
Metadata minExpected = new Metadata();
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Embolden()");
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Italicize()");
@@ -1291,11 +1327,31 @@ public class OOXMLParserTest extends TikaTest {
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
- assertContainsAtLeast(minExpected,
getRecursiveMetadata("testPPT_macros.pptm"));
+ assertContainsAtLeast(minExpected,
getRecursiveMetadata("testPPT_macros.pptm", context));
+/*
+ //test configuring via config file
+ TikaConfig tikaConfig = new
TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml"));
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ assertContainsAtLeast(minExpected,
getRecursiveMetadata("testPPT_macros.pptm", parser));
+*/
}
@Test
public void testMacroinXlsm() throws Exception {
+
+ //test default is "don't extract macros"
+ for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xlsm"))
{
+ if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+ fail("Shouldn't have extract macros as default");
+ }
+ }
+
+ //now test that they were extracted
+ ParseContext context = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setExtractMacros(true);
+ context.set(OfficeParserConfig.class, officeParserConfig);
+
Metadata minExpected = new Metadata();
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Dirty()");
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty
dirt dirt");
@@ -1303,7 +1359,14 @@ public class OOXMLParserTest extends TikaTest {
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
- assertContainsAtLeast(minExpected,
getRecursiveMetadata("testEXCEL_macro.xlsm"));
+ assertContainsAtLeast(minExpected,
+ getRecursiveMetadata("testEXCEL_macro.xlsm", context));
+/*
+ //test configuring via config file
+ TikaConfig tikaConfig = new
TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml"));
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ assertContainsAtLeast(minExpected,
getRecursiveMetadata("testEXCEL_macro.xlsm", parser));
+*/
}
//@Test //use this for lightweight benchmarking to compare xwpf options
diff --git
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index c949885..68f67a1 100644
---
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
import java.io.InputStream;
import java.util.HashMap;
@@ -514,6 +515,29 @@ public class SXSLFExtractorTest extends TikaTest {
@Test
public void testMacrosInPptm() throws Exception {
+
+ Metadata parsedBy = new Metadata();
+ parsedBy.add("X-Parsed-By",
+
"org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor");
+
+ List<Metadata> metadataList =
getRecursiveMetadata("testPPT_macros.pptm", parseContext);
+
+ //test default is "don't extract macros"
+ for (Metadata metadata : metadataList) {
+ if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+ fail("Shouldn't have extract macros as default");
+ }
+ }
+
+ assertContainsAtLeast(parsedBy, metadataList);
+
+ //now test that they are extracted
+ ParseContext context = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setExtractMacros(true);
+ officeParserConfig.setUseSAXPptxExtractor(true);
+ context.set(OfficeParserConfig.class, officeParserConfig);
+
Metadata minExpected = new Metadata();
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Embolden()");
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Italicize()");
@@ -521,6 +545,16 @@ public class SXSLFExtractorTest extends TikaTest {
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
- assertContainsAtLeast(minExpected,
getRecursiveMetadata("testPPT_macros.pptm", parseContext));
+ metadataList = getRecursiveMetadata("testPPT_macros.pptm", context);
+
+ assertContainsAtLeast(minExpected, metadataList);
+ assertContainsAtLeast(parsedBy, metadataList);
+/*
+ //test configuring via config file
+ TikaConfig tikaConfig = new
TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml"));
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ metadataList = getRecursiveMetadata("testPPT_macros.pptm", parser);
+ assertContainsAtLeast(minExpected, metadataList);
+ assertContainsAtLeast(parsedBy, metadataList);*/
}
}
diff --git
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 635d0c9..9cb2ef8 100644
---
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -20,6 +20,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
@@ -701,9 +702,32 @@ public class SXWPFExtractorTest extends TikaTest {
@Test
public void testMacrosInDocm() throws Exception {
+
+ Metadata parsedBy = new Metadata();
+ parsedBy.add("X-Parsed-By",
+
"org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor");
+
+ //test default is "don't extract macros"
List<Metadata> metadataList =
getRecursiveMetadata("testWORD_macros.docm", parseContext);
+ for (Metadata metadata : metadataList) {
+ if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
+ fail("Shouldn't have extract macros as default");
+ }
+ }
+ assertContainsAtLeast(parsedBy, metadataList);
+
+ //now test that they were extracted
+ ParseContext context = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setExtractMacros(true);
+ officeParserConfig.setUseSAXDocxExtractor(true);
+ context.set(OfficeParserConfig.class, officeParserConfig);
+
+ metadataList = getRecursiveMetadata("testWORD_macros.docm", context);
//check that content came out of the .docm file
assertContains("quick",
metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContainsAtLeast(parsedBy, metadataList);
+
Metadata minExpected = new Metadata();
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Embolden()");
@@ -712,7 +736,16 @@ public class SXWPFExtractorTest extends TikaTest {
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
- assertContainsAtLeast(minExpected, metadataList);//, parseContext));
+ assertContainsAtLeast(minExpected, metadataList);
+ assertContainsAtLeast(parsedBy, metadataList);
+/*
+ //test configuring via config file
+ TikaConfig tikaConfig = new
TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml"));
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ metadataList = getRecursiveMetadata("testWORD_macros.docm", parser);
+ assertContainsAtLeast(minExpected, metadataList);
+ assertContainsAtLeast(parsedBy, metadataList);
+*/
}
@Test
diff --git
a/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml
b/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml
new file mode 100644
index 0000000..a0e822f
--- /dev/null
+++
b/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+ <params>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git
a/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml
b/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml
new file mode 100644
index 0000000..83d890c
--- /dev/null
+++
b/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="extractMacros" type="bool">true</param>
+ <param name="useSAXDocxExtractor" type="bool">true</param>
+ <param name="useSAXPptxExtractor" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+ <params>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git
a/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml
b/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml
new file mode 100644
index 0000000..a0e822f
--- /dev/null
+++
b/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+ <params>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git
a/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
b/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
new file mode 100644
index 0000000..cad9c5a
--- /dev/null
+++
b/tika-test-resources/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="useSAXDocxExtractor" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].