Repository: tika Updated Branches: refs/heads/master 415381212 -> 2ae7206d9
TIKA-2069 -- extract macros from MSOffice docs Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/2ae7206d Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/2ae7206d Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/2ae7206d Branch: refs/heads/master Commit: 2ae7206d9c99fb553314cff21bb155d4e6f06d12 Parents: 4153812 Author: tballison <[email protected]> Authored: Wed Sep 21 21:03:20 2016 -0400 Committer: tballison <[email protected]> Committed: Wed Sep 21 21:03:20 2016 -0400 ---------------------------------------------------------------------- CHANGES.txt | 2 + .../tika/metadata/TikaCoreProperties.java | 14 ++++- .../tika/parser/microsoft/OfficeParser.java | 50 +++++++++++++++ .../microsoft/ooxml/AbstractOOXMLExtractor.java | 20 +++++- .../ooxml/XSLFPowerPointExtractorDecorator.java | 3 + .../ooxml/XSSFExcelExtractorDecorator.java | 7 +++ .../tika/parser/microsoft/ExcelParserTest.java | 12 ++++ .../parser/microsoft/PowerPointParserTest.java | 13 ++++ .../tika/parser/microsoft/WordParserTest.java | 13 ++++ .../parser/microsoft/ooxml/OOXMLParserTest.java | 63 ++++++++++++++----- .../test-documents/testEXCEL_macro.xls | Bin 0 -> 30720 bytes .../test-documents/testEXCEL_macro.xlsm | Bin 0 -> 14561 bytes .../resources/test-documents/testPPT_macros.ppt | Bin 0 -> 88064 bytes .../test-documents/testPPT_macros.pptm | Bin 0 -> 32824 bytes .../test-documents/testWORD_macros.doc | Bin 0 -> 38400 bytes .../test-documents/testWORD_macros.docm | Bin 0 -> 17322 bytes 16 files changed, 178 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index fc94e70..9a03b01 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,7 @@ Release 1.14 - ??? + * Extract macros from MSOffice files (TIKA-2069). + * Maintain passed-in mime in TXTParser (TIKA-2047). * Upgrade to POI.3-15 (TIKA-2013). http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index f4b97dd..9245086 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -39,16 +39,24 @@ public interface TikaCoreProperties { /** * A file might contain different types of embedded documents. - * The most common is the ATTACHEMENT. + * The most common is the ATTACHMENT. + * <p> * An INLINE embedded resource should be used for embedded image * files that are used to render the page image (as in PDXObjImages in PDF files). * <p> - * Not all parsers have yet implemented this. + * A MACRO is code that is embedded in the document and is intended + * to be executable within the application that opens the document. This + * includes traditional macros within Microsoft Office files and + * javascript within PDFActions. This would not include, e.g., an + * .exe file embedded in a .zip file. + * <p> + * Not all parsers have yet implemented this. * */ public enum EmbeddedResourceType { INLINE, - ATTACHMENT + ATTACHMENT, + MACRO }; /** http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index b6681aa..f7f1c4a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -16,13 +16,16 @@ */ package org.apache.tika.parser.microsoft; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.security.GeneralSecurityException; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Locale; +import java.util.Map; import java.util.Set; import org.apache.commons.io.input.CloseShieldInputStream; @@ -35,11 +38,15 @@ import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.poifs.macros.VBAMacroReader; import org.apache.poi.util.IOUtils; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; @@ -117,9 +124,17 @@ public class OfficeParser extends AbstractParser { //tstream will close the fs, no need to close this below tstream.setOpenContainer(fs); root = fs.getRoot(); + } } parse(root, context, metadata, xhtml); + + //now try to get macros + EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); + if (ex == null) { + ex = new ParsingEmbeddedDocumentExtractor(context); + } + extractMacros(root.getNFileSystem(), xhtml, ex); } finally { IOUtils.closeQuietly(mustCloseFs); } @@ -279,4 +294,39 @@ public class OfficeParser extends AbstractParser { } } + /** + * Helper to extract macros from an NPOIFS/vbaProject.bin + * + * As of POI-3.15-final, there are still some bugs in VBAMacroReader. + * For now, we are swallowing NPE and other runtime exceptions + * + * @param fs NPOIFS to extract from + * @param xhtml SAX writer + * @param embeddedDocumentExtractor extractor for embedded documents + * @throws IOException on IOException if it occurs during the extraction of the embedded doc + * @throws SAXException on SAXException for writing to xhtml + */ + public static void extractMacros(NPOIFSFileSystem fs, ContentHandler xhtml, EmbeddedDocumentExtractor + embeddedDocumentExtractor) throws IOException, SAXException { + + VBAMacroReader reader = null; + Map<String, String> macros = null; + try { + reader = new VBAMacroReader(fs); + macros = reader.readMacros(); + } catch (Exception e) { + //swallow + return; + } + for (Map.Entry<String, String> e : macros.entrySet()) { + Metadata m = new Metadata(); + m.set(Metadata.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + m.set(Metadata.CONTENT_TYPE, "text/x-vbasic"); + if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { + embeddedDocumentExtractor.parseEmbedded( + new ByteArrayInputStream(e.getValue().getBytes(StandardCharsets.UTF_8)), xhtml, m, true); + } + } + } + } http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 67468b0..1f16a3c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -33,6 +33,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; import org.apache.poi.openxml4j.opc.TargetMode; import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -43,6 +44,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; @@ -64,7 +66,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"; static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject"; static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package"; - + static final String RELATION_MACRO = "http://schemas.microsoft.com/office/2006/relationships/vbaProject"; + static final String RELATION_OFFICE_DOCUMENT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; private static final String TYPE_OLE_OBJECT = "application/vnd.openxmlformats-officedocument.oleObject"; private final EmbeddedDocumentExtractor embeddedExtractor; @@ -197,6 +200,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { || RELATION_PACKAGE.equals(type) || RELATION_OLE_OBJECT.equals(type)) { handleEmbeddedFile(target, handler, sourceDesc + rel.getId()); + } else if (RELATION_MACRO.equals(type)) { + handleMacros(target, handler); } } } @@ -325,4 +330,17 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { */ protected abstract List<PackagePart> getMainDocumentParts() throws TikaException; + + + void handleMacros(PackagePart macroPart, ContentHandler handler) throws TikaException, SAXException { + + try (InputStream is = macroPart.getInputStream()) { + try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) { + //Macro reading exceptions are already swallowed here + OfficeParser.extractMacros(npoifs, handler, embeddedExtractor); + } + } catch (IOException e) { + throw new TikaException("Broken OOXML file", e); + } + } } http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java index 0ea58c0..160f761 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java @@ -280,6 +280,9 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { } } } + //add full document to include macros + parts.add(document.getPackagePart()); + return parts; } } http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index ae8b6cb..0f6957c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -320,6 +320,13 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } + //add main document so that macros can be extracted + //by AbstractOOXMLExtractor + for (PackagePart part : extractor.getPackage(). + getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) { + parts.add(part); + } + return parts; } http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java index cb93b55..eb1a814 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java @@ -472,4 +472,16 @@ public class ExcelParserTest extends TikaTest { assertContains("1.23456789012345E15", xml);//16 digit number is treated as scientific notation assertContains("1.23456789012345E15", xml);//16 digit formula, ditto } + + @Test + public void testMacroinXls() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xls"); + Metadata macroMetadata = metadataList.get(1); + assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), + macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } + } http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java index ca20be7..41400c5 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java @@ -30,6 +30,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.BodyContentHandler; +import org.junit.Ignore; import org.junit.Test; import org.xml.sax.ContentHandler; @@ -249,4 +250,16 @@ public class PowerPointParserTest extends TikaTest { assertContains("Hello World", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT)); assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY)); } + + @Test + @Ignore("POI 3.15-final not finding any macros in this ppt") + public void testMacros() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.ppt"); + Metadata macroMetadata = metadataList.get(1); + assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), + macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } } http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java index 8b42ff1..e63a61b 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java @@ -34,6 +34,7 @@ import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.BodyContentHandler; import org.junit.Ignore; import org.junit.Test; @@ -520,5 +521,17 @@ public class WordParserTest extends TikaTest { assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml); assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml); } + + @Test + public void testMacros() throws Exception { + //debug(getRecursiveMetadata("SimpleMacro.doc")); + List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc"); + Metadata macroMetadata = metadataList.get(1); + assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), + macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } } http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index ac62b03..ccfb293 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -578,39 +578,39 @@ public class OOXMLParserTest extends TikaTest { assertContains("Here is a citation:", content); assertContains("Figure 1 This is a caption for Figure 1", content); assertContains("(Kramer)", content); - assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," ")); - assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," ")); + assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " ")); + assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " ")); assertContains("This is a hyperlink", content); assertContains("Here is a list:", content); - for(int row=1;row<=3;row++) { + for (int row = 1; row <= 3; row++) { //assertContains("·\tBullet " + row, content); //assertContains("\u00b7\tBullet " + row, content); assertContains("Bullet " + row, content); } assertContains("Here is a numbered list:", content); - for(int row=1;row<=3;row++) { + for (int row = 1; row <= 3; row++) { //assertContains(row + ")\tNumber bullet " + row, content); //assertContains(row + ") Number bullet " + row, content); // TODO: OOXMLExtractor fails to number the bullets: assertContains("Number bullet " + row, content); } - for(int row=1;row<=2;row++) { - for(int col=1;col<=3;col++) { + for (int row = 1; row <= 2; row++) { + for (int col = 1; col <= 3; col++) { assertContains("Row " + row + " Col " + col, content); } } assertContains("Keyword1 Keyword2", content); assertEquals("Keyword1 Keyword2", - metadata.get(Metadata.KEYWORDS)); + metadata.get(Metadata.KEYWORDS)); assertContains("Subject is here", content); // TODO: Remove subject in Tika 2.0 assertEquals("Subject is here", - metadata.get(Metadata.SUBJECT)); + metadata.get(Metadata.SUBJECT)); assertEquals("Subject is here", - metadata.get(OfficeOpenXMLCore.SUBJECT)); + metadata.get(OfficeOpenXMLCore.SUBJECT)); assertContains("Suddenly some Japanese text:", content); // Special version of (GHQ) @@ -642,21 +642,21 @@ public class OOXMLParserTest extends TikaTest { assertContains("<p>Row 2 column 2</p>", xml); assertContains("<p><a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml); assertContains("<p>Here is a list:", xml); - for(int row=1;row<=3;row++) { + for (int row = 1; row <= 3; row++) { //assertContains("·\tBullet " + row, content); //assertContains("\u00b7\tBullet " + row, content); assertContains("<p>Bullet " + row, xml); } assertContains("Here is a numbered list:", xml); - for(int row=1;row<=3;row++) { + for (int row = 1; row <= 3; row++) { //assertContains(row + ")\tNumber bullet " + row, content); //assertContains(row + ") Number bullet " + row, content); // TODO: OOXMLExtractor fails to number the bullets: assertContains("<p>Number bullet " + row, xml); } - for(int row=1;row<=2;row++) { - for(int col=1;col<=3;col++) { + for (int row = 1; row <= 2; row++) { + for (int col = 1; col <= 3; col++) { assertContains("Row " + row + " Col " + col, xml); } } @@ -668,7 +668,7 @@ public class OOXMLParserTest extends TikaTest { assertContains("Subject is here", xml); // TODO: Remove subject in Tika 2.0 assertEquals("Subject is here", - metadata.get(Metadata.SUBJECT)); + metadata.get(Metadata.SUBJECT)); assertEquals("Subject is here", metadata.get(OfficeOpenXMLCore.SUBJECT)); @@ -1254,7 +1254,7 @@ public class OOXMLParserTest extends TikaTest { String xml = getXML("testWORD_boldHyperlink.docx").xml; xml = xml.replaceAll("\\s+", " "); assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml); - assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml); + assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold", xml); } @Test @@ -1263,6 +1263,39 @@ public class OOXMLParserTest extends TikaTest { assertContains("bold", getXML("testWORD_totalTimeOutOfRange.docx").xml); } + @Test + public void testMacrosInDocm() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm"); + Metadata macroMetadata = metadataList.get(1); + assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), + macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } + + @Test + public void testMacrosInPptm() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm"); + Metadata macroMetadata = metadataList.get(1); + assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), + macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } + + @Test + public void testMacroinXlsm() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xlsm"); + Metadata macroMetadata = metadataList.get(1); + assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(), + macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } + } http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls new file mode 100644 index 0000000..b97f9b2 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls differ http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm new file mode 100644 index 0000000..d21452b Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm differ http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt b/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt new file mode 100644 index 0000000..7af9008 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt differ http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm b/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm new file mode 100644 index 0000000..058a039 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm differ http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc b/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc new file mode 100644 index 0000000..838d86b Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc differ http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm b/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm new file mode 100644 index 0000000..a915310 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm differ
