Repository: tika
Updated Branches:
  refs/heads/2.x 32d9ece8d -> 66f433471


TIKA-2069 -- extract macros from MSOffice files.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/66f43347
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/66f43347
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/66f43347

Branch: refs/heads/2.x
Commit: 66f433471f59d5af931f0a49bf8bddd33a7f27a7
Parents: 32d9ece
Author: tballison <talli...@mitre.org>
Authored: Wed Sep 21 21:08:46 2016 -0400
Committer: tballison <talli...@mitre.org>
Committed: Wed Sep 21 21:08:46 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   2 +
 .../tika/metadata/TikaCoreProperties.java       |  14 +++-
 .../tika/parser/microsoft/OfficeParser.java     |  50 +++++++++++++++
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |  20 +++++-
 .../ooxml/XSLFPowerPointExtractorDecorator.java |   3 +
 .../ooxml/XSSFExcelExtractorDecorator.java      |   7 ++
 .../tika/parser/microsoft/ExcelParserTest.java  |  12 ++++
 .../parser/microsoft/PowerPointParserTest.java  |  18 +++++-
 .../tika/parser/microsoft/WordParserTest.java   |  13 ++++
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  64 ++++++++++++++-----
 .../test-documents/testEXCEL_macro.xls          | Bin 0 -> 30720 bytes
 .../test-documents/testEXCEL_macro.xlsm         | Bin 0 -> 14561 bytes
 .../resources/test-documents/testPPT_macros.ppt | Bin 0 -> 88064 bytes
 .../test-documents/testPPT_macros.pptm          | Bin 0 -> 32824 bytes
 .../test-documents/testWORD_macros.doc          | Bin 0 -> 38400 bytes
 .../test-documents/testWORD_macros.docm         | Bin 0 -> 17322 bytes
 16 files changed, 182 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 46a5894..53f9a82 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,8 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
+  * Extract macros from MSOffice files (TIKA-2069).
+
   * Maintain passed-in mime in TXTParser (TIKA-2047).
 
   * Upgrade to POI 3.15-final (TIKA-2013).

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java 
b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index f4b97dd..9245086 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -39,16 +39,24 @@ public interface TikaCoreProperties {
 
     /**
      * A file might contain different types of embedded documents.
-     * The most common is the ATTACHEMENT.
+     * The most common is the ATTACHMENT.
+     * <p>
      * An INLINE embedded resource should be used for embedded image
      * files that are used to render the page image (as in PDXObjImages in PDF 
files).
      * <p>
-     * Not all parsers have yet implemented this. 
+     * A MACRO is code that is embedded in the document and is intended
+     * to be executable within the application that opens the document.  This
+     * includes traditional macros within Microsoft Office files and
+     * javascript within PDFActions.  This would not include, e.g., an
+     * .exe file embedded in a .zip file.
+     * <p>
+     * Not all parsers have yet implemented this.
      *
      */
     public enum EmbeddedResourceType {
         INLINE,
-        ATTACHMENT
+        ATTACHMENT,
+        MACRO
     };
 
     /**

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index b6681aa..f7f1c4a 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -16,13 +16,16 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 import java.security.GeneralSecurityException;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
@@ -35,11 +38,15 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.macros.VBAMacroReader;
 import org.apache.poi.util.IOUtils;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
@@ -117,9 +124,17 @@ public class OfficeParser extends AbstractParser {
                     //tstream will close the fs, no need to close this below
                     tstream.setOpenContainer(fs);
                     root = fs.getRoot();
+
                 }
             }
             parse(root, context, metadata, xhtml);
+
+            //now try to get macros
+            EmbeddedDocumentExtractor ex = 
context.get(EmbeddedDocumentExtractor.class);
+            if (ex == null) {
+                ex = new ParsingEmbeddedDocumentExtractor(context);
+            }
+            extractMacros(root.getNFileSystem(), xhtml, ex);
         } finally {
             IOUtils.closeQuietly(mustCloseFs);
         }
@@ -279,4 +294,39 @@ public class OfficeParser extends AbstractParser {
         }
     }
 
+    /**
+     * Helper to extract macros from an NPOIFS/vbaProject.bin
+     *
+     * As of POI-3.15-final, there are still some bugs in VBAMacroReader.
+     * For now, we are swallowing NPE and other runtime exceptions
+     *
+     * @param fs NPOIFS to extract from
+     * @param xhtml SAX writer
+     * @param embeddedDocumentExtractor extractor for embedded documents
+     * @throws IOException on IOException if it occurs during the extraction 
of the embedded doc
+     * @throws SAXException on SAXException for writing to xhtml
+     */
+    public static void extractMacros(NPOIFSFileSystem fs, ContentHandler 
xhtml, EmbeddedDocumentExtractor
+            embeddedDocumentExtractor)  throws IOException, SAXException {
+
+        VBAMacroReader reader = null;
+        Map<String, String> macros = null;
+        try {
+            reader = new VBAMacroReader(fs);
+            macros = reader.readMacros();
+        } catch (Exception e) {
+            //swallow
+            return;
+        }
+        for (Map.Entry<String, String> e : macros.entrySet()) {
+            Metadata m = new Metadata();
+            m.set(Metadata.EMBEDDED_RESOURCE_TYPE, 
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+            m.set(Metadata.CONTENT_TYPE, "text/x-vbasic");
+            if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
+                embeddedDocumentExtractor.parseEmbedded(
+                        new 
ByteArrayInputStream(e.getValue().getBytes(StandardCharsets.UTF_8)), xhtml, m, 
true);
+            }
+        }
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 67468b0..1f16a3c 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -33,6 +33,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -43,6 +44,7 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -64,7 +66,8 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
     static final String RELATION_IMAGE = 
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
     static final String RELATION_OLE_OBJECT = 
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
     static final String RELATION_PACKAGE = 
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
-
+    static final String RELATION_MACRO = 
"http://schemas.microsoft.com/office/2006/relationships/vbaProject";
+    static final String RELATION_OFFICE_DOCUMENT = 
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
     private static final String TYPE_OLE_OBJECT =
             "application/vnd.openxmlformats-officedocument.oleObject";
     private final EmbeddedDocumentExtractor embeddedExtractor;
@@ -197,6 +200,8 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
                                 || RELATION_PACKAGE.equals(type)
                                 || RELATION_OLE_OBJECT.equals(type)) {
                             handleEmbeddedFile(target, handler, sourceDesc + 
rel.getId());
+                        } else if (RELATION_MACRO.equals(type)) {
+                            handleMacros(target, handler);
                         }
                     }
                 }
@@ -325,4 +330,17 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
      */
     protected abstract List<PackagePart> getMainDocumentParts()
             throws TikaException;
+
+
+    void handleMacros(PackagePart macroPart, ContentHandler handler) throws 
TikaException, SAXException {
+
+        try (InputStream is = macroPart.getInputStream()) {
+            try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
+                //Macro reading exceptions are already swallowed here
+                OfficeParser.extractMacros(npoifs, handler, embeddedExtractor);
+            }
+        } catch (IOException e) {
+            throw new TikaException("Broken OOXML file", e);
+        }
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index a03eec6..aaef789 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -280,6 +280,9 @@ public class XSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
                 }
             }
         }
+        //add full document to include macros
+        parts.add(document.getPackagePart());
+
         return parts;
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index d375dd9..912469f 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -320,6 +320,13 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
             }
         }
 
+        //add main document so that macros can be extracted
+        //by AbstractOOXMLExtractor
+        for (PackagePart part : extractor.getPackage().
+                getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
+            parts.add(part);
+        }
+
         return parts;
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 6f411f5..b136a5d 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -36,6 +36,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
@@ -419,4 +420,15 @@ public class ExcelParserTest extends TikaTest {
         assertContains("1.23456789012345E15", xml);//16 digit number is 
treated as scientific notation
         assertContains("1.23456789012345E15", xml);//16 digit formula, ditto
     }
+
+    @Test
+    public void testMacroinXls() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testEXCEL_macro.xls");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Dirty()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("dirty dirt dirt", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 0aef289..7e68ce8 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -16,19 +16,21 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import static org.junit.Assert.assertEquals;
+
 import java.io.InputStream;
 import java.util.List;
 import java.util.Locale;
 
-import static org.junit.Assert.assertEquals;
-
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
@@ -248,4 +250,16 @@ public class PowerPointParserTest extends TikaTest {
         assertEquals("application/pdf", 
metadataList.get(2).get(Metadata.CONTENT_TYPE));
         assertEquals("4.pdf", 
metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
     }
+
+    @Test
+    @Ignore("POI 3.15-final not finding any macros in this ppt")
+    public void testMacros() throws  Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testPPT_macros.ppt");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Embolden()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("Sub Italicize()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 8b42ff1..e63a61b 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -34,6 +34,7 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -520,5 +521,17 @@ public class WordParserTest extends TikaTest {
         assertContains("<a href=\"http://tika.apache.org/\">hyper 
<b>link</b></a>", xml);
         assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> 
link</a>; bold" , xml);
     }
+
+    @Test
+    public void testMacros() throws  Exception {
+        //debug(getRecursiveMetadata("SimpleMacro.doc"));
+        List<Metadata> metadataList = 
getRecursiveMetadata("testWORD_macros.doc");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Embolden()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("Sub Italicize()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index ef9291c..3c67397 100644
--- 
a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ 
b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -47,6 +47,7 @@ import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.microsoft.WordParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Ignore;
@@ -577,39 +578,39 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("Here is a citation:", content);
         assertContains("Figure 1 This is a caption for Figure 1", content);
         assertContains("(Kramer)", content);
-        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 
Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
-        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 
column 2", content.replaceAll("\\s+"," "));
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 
Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 
column 2", content.replaceAll("\\s+", " "));
         assertContains("This is a hyperlink", content);
         assertContains("Here is a list:", content);
-        for(int row=1;row<=3;row++) {
+        for (int row = 1; row <= 3; row++) {
             //assertContains("·\tBullet " + row, content);
             //assertContains("\u00b7\tBullet " + row, content);
             assertContains("Bullet " + row, content);
         }
         assertContains("Here is a numbered list:", content);
-        for(int row=1;row<=3;row++) {
+        for (int row = 1; row <= 3; row++) {
             //assertContains(row + ")\tNumber bullet " + row, content);
             //assertContains(row + ") Number bullet " + row, content);
             // TODO: OOXMLExtractor fails to number the bullets:
             assertContains("Number bullet " + row, content);
         }
 
-        for(int row=1;row<=2;row++) {
-            for(int col=1;col<=3;col++) {
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
                 assertContains("Row " + row + " Col " + col, content);
             }
         }
 
         assertContains("Keyword1 Keyword2", content);
         assertEquals("Keyword1 Keyword2",
-                     metadata.get(Metadata.KEYWORDS));
+                metadata.get(Metadata.KEYWORDS));
 
         assertContains("Subject is here", content);
         // TODO: Remove subject in Tika 2.0
         assertEquals("Subject is here",
-                     metadata.get(Metadata.SUBJECT));
+                metadata.get(Metadata.SUBJECT));
         assertEquals("Subject is here",
-                     metadata.get(OfficeOpenXMLCore.SUBJECT));
+                metadata.get(OfficeOpenXMLCore.SUBJECT));
 
         assertContains("Suddenly some Japanese text:", content);
         // Special version of (GHQ)
@@ -641,21 +642,21 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("<p>Row 2 column 2</p>", xml);
         assertContains("<p><a href=\"http://tika.apache.org/\">This is a 
hyperlink</a>", xml);
         assertContains("<p>Here is a list:", xml);
-        for(int row=1;row<=3;row++) {
+        for (int row = 1; row <= 3; row++) {
             //assertContains("·\tBullet " + row, content);
             //assertContains("\u00b7\tBullet " + row, content);
             assertContains("<p>Bullet " + row, xml);
         }
         assertContains("Here is a numbered list:", xml);
-        for(int row=1;row<=3;row++) {
+        for (int row = 1; row <= 3; row++) {
             //assertContains(row + ")\tNumber bullet " + row, content);
             //assertContains(row + ") Number bullet " + row, content);
             // TODO: OOXMLExtractor fails to number the bullets:
             assertContains("<p>Number bullet " + row, xml);
         }
 
-        for(int row=1;row<=2;row++) {
-            for(int col=1;col<=3;col++) {
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
                 assertContains("Row " + row + " Col " + col, xml);
             }
         }
@@ -667,7 +668,7 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("Subject is here", xml);
         // TODO: Remove subject in Tika 2.0
         assertEquals("Subject is here",
-                     metadata.get(Metadata.SUBJECT));
+                metadata.get(Metadata.SUBJECT));
         assertEquals("Subject is here",
                 metadata.get(OfficeOpenXMLCore.SUBJECT));
 
@@ -1253,7 +1254,7 @@ public class OOXMLParserTest extends TikaTest {
         String xml = getXML("testWORD_boldHyperlink.docx").xml;
         xml = xml.replaceAll("\\s+", " ");
         assertContains("<a href=\"http://tika.apache.org/\">hyper 
<b>link</b></a>", xml);
-        assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> 
link</a>; bold" , xml);
+        assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> 
link</a>; bold", xml);
     }
 
     @Test
@@ -1262,6 +1263,39 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("bold", 
getXML("testWORD_totalTimeOutOfRange.docx").xml);
     }
 
+    @Test
+    public void testMacrosInDocm() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testWORD_macros.docm");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Embolden()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("Sub Italicize()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
+
+    @Test
+    public void testMacrosInPptm() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testPPT_macros.pptm");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Embolden()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("Sub Italicize()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
+
+    @Test
+    public void testMacroinXlsm() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testEXCEL_macro.xlsm");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Dirty()", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("dirty dirt dirt", 
macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", 
macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
+
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-test-resources/src/test/resources/test-documents/testEXCEL_macro.xls
----------------------------------------------------------------------
diff --git 
a/tika-test-resources/src/test/resources/test-documents/testEXCEL_macro.xls 
b/tika-test-resources/src/test/resources/test-documents/testEXCEL_macro.xls
new file mode 100644
index 0000000..b97f9b2
Binary files /dev/null and 
b/tika-test-resources/src/test/resources/test-documents/testEXCEL_macro.xls 
differ

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-test-resources/src/test/resources/test-documents/testEXCEL_macro.xlsm
----------------------------------------------------------------------
diff --git 
a/tika-test-resources/src/test/resources/test-documents/testEXCEL_macro.xlsm 
b/tika-test-resources/src/test/resources/test-documents/testEXCEL_macro.xlsm
new file mode 100644
index 0000000..d21452b
Binary files /dev/null and 
b/tika-test-resources/src/test/resources/test-documents/testEXCEL_macro.xlsm 
differ

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-test-resources/src/test/resources/test-documents/testPPT_macros.ppt
----------------------------------------------------------------------
diff --git 
a/tika-test-resources/src/test/resources/test-documents/testPPT_macros.ppt 
b/tika-test-resources/src/test/resources/test-documents/testPPT_macros.ppt
new file mode 100644
index 0000000..7af9008
Binary files /dev/null and 
b/tika-test-resources/src/test/resources/test-documents/testPPT_macros.ppt 
differ

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-test-resources/src/test/resources/test-documents/testPPT_macros.pptm
----------------------------------------------------------------------
diff --git 
a/tika-test-resources/src/test/resources/test-documents/testPPT_macros.pptm 
b/tika-test-resources/src/test/resources/test-documents/testPPT_macros.pptm
new file mode 100644
index 0000000..058a039
Binary files /dev/null and 
b/tika-test-resources/src/test/resources/test-documents/testPPT_macros.pptm 
differ

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-test-resources/src/test/resources/test-documents/testWORD_macros.doc
----------------------------------------------------------------------
diff --git 
a/tika-test-resources/src/test/resources/test-documents/testWORD_macros.doc 
b/tika-test-resources/src/test/resources/test-documents/testWORD_macros.doc
new file mode 100644
index 0000000..838d86b
Binary files /dev/null and 
b/tika-test-resources/src/test/resources/test-documents/testWORD_macros.doc 
differ

http://git-wip-us.apache.org/repos/asf/tika/blob/66f43347/tika-test-resources/src/test/resources/test-documents/testWORD_macros.docm
----------------------------------------------------------------------
diff --git 
a/tika-test-resources/src/test/resources/test-documents/testWORD_macros.docm 
b/tika-test-resources/src/test/resources/test-documents/testWORD_macros.docm
new file mode 100644
index 0000000..a915310
Binary files /dev/null and 
b/tika-test-resources/src/test/resources/test-documents/testWORD_macros.docm 
differ

Reply via email to