This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-2851 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 89435df8bf49f2c3d1b613583362da278c3c866f Author: tallison <[email protected]> AuthorDate: Wed Oct 9 19:30:25 2019 -0700 Make modifications for future integration of POI 4.1.x. Note: commons-codec started throwing an exception on an embedded base64 encoded doc so I had to make some changes there. TODO: we should figure out if the exception is correct and/or if we should handle that leniently. The wordml file was generated by MSWord and shouldn't cause exceptions in embedded files --- tika-bundle/pom.xml | 17 ++++-- tika-parent/pom.xml | 4 +- tika-parsers/pom.xml | 4 +- .../apache/tika/parser/microsoft/EMFParser.java | 61 ++++++++++++---------- .../apache/tika/parser/microsoft/OfficeParser.java | 2 +- .../apache/tika/parser/microsoft/WMFParser.java | 6 +-- .../tika/parser/microsoft/xml/WordMLParser.java | 23 +++++--- .../test/java/org/apache/tika/TestXXEInXML.java | 57 ++++++++++++++------ .../tika/parser/microsoft/WMFParserTest.java | 5 ++ .../parser/microsoft/xml/XML2003ParserTest.java | 8 +-- 10 files changed, 120 insertions(+), 67 deletions(-) diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml index 4c04b4d..3c4fcc8 100644 --- a/tika-bundle/pom.xml +++ b/tika-bundle/pom.xml @@ -172,6 +172,7 @@ poi|poi-scratchpad| poi-ooxml| poi-ooxml-schemas| + commons-math3| curvesapi| xmlbeans| jackcess| @@ -279,6 +280,18 @@ opennlp.tools.namefind;resolution:=optional, opennlp.tools.authorage;resolution:=optional, net.didion.jwnl;resolution:=optional, + net.sf.saxon;resolution:=optional, + net.sf.saxon.dom;resolution:=optional, + net.sf.saxon.om;resolution:=optional, + net.sf.saxon.query;resolution:=optional, + net.sf.saxon.sxpath;resolution:=optional, + net.sf.saxon.value;resolution:=optional, + org.apache.batik.anim.dom;resolution:=optional, + org.apache.batik.bridge;resolution:=optional, + org.apache.batik.ext.awt;resolution:=optional, + org.apache.batik.ext.awt.image.renderable;resolution:=optional, + org.apache.batik.gvt;resolution:=optional, + org.apache.batik.util;resolution:=optional, org.apache.cxf.jaxrs.client;resolution:=optional, org.apache.cxf.jaxrs.ext.multipart;resolution:=optional, org.apache.commons.exec;resolution:=optional, @@ -289,10 +302,6 @@ org.apache.commons.httpclient.params;resolution:=optional, org.apache.commons.httpclient.protocol;resolution:=optional, org.apache.commons.httpclient.util;resolution:=optional, - - org.apache.commons.math3.exception;resolution:=optional, - org.apache.commons.math3.linear;resolution:=optional, - org.apache.commons.math3.stat.regression;resolution:=optional, org.apache.commons.vfs2;resolution:=optional, org.apache.commons.vfs2.provider;resolution:=optional, org.apache.commons.vfs2.util;resolution:=optional, diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 7c239aa..1c2cf91 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -334,8 +334,8 @@ <maven.shade.version>3.2.1</maven.shade.version> <rat.version>0.13</rat.version> <!-- NOTE: sync tukaani version with commons-compress in tika-parsers --> - <poi.version>4.0.1</poi.version> - <commons.compress.version>1.18</commons.compress.version> + <poi.version>4.1.1-SNAPSHOT</poi.version> + <commons.compress.version>1.19</commons.compress.version> <commons.io.version>2.6</commons.io.version> <commons.lang3.version>3.9</commons.lang3.version> <gson.version>2.8.5</gson.version> diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 98bfbe1..ab8a769 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -36,7 +36,7 @@ <properties> <!-- NOTE: sync codec version with POI --> - <codec.version>1.12</codec.version> + <codec.version>1.13</codec.version> <!-- NOTE: sync tukaani version with commons-compress in tika-parent--> <tukaani.version>1.8</tukaani.version> <!-- NOTE: sync brotli version with commons-compress in tika-parent--> @@ -78,7 +78,7 @@ <scope>test</scope> </dependency> - <!-- for java 10 + <!-- for java 10 See TIKA-2778 for why we need to do this now. May the gods of API design fix this in the future. --> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java index 6c967e3..6f28648 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java @@ -22,13 +22,11 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; -import org.apache.poi.hemf.extractor.HemfExtractor; -import org.apache.poi.hemf.record.AbstractHemfComment; -import org.apache.poi.hemf.record.HemfCommentPublic; -import org.apache.poi.hemf.record.HemfCommentRecord; -import org.apache.poi.hemf.record.HemfRecord; -import org.apache.poi.hemf.record.HemfRecordType; -import org.apache.poi.hemf.record.HemfText; +import org.apache.poi.hemf.record.emf.HemfComment; +import org.apache.poi.hemf.record.emf.HemfRecord; +import org.apache.poi.hemf.record.emf.HemfRecordType; +import org.apache.poi.hemf.record.emf.HemfText; +import org.apache.poi.hemf.usermodel.HemfPicture; import org.apache.poi.util.RecordFormatException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -74,41 +72,46 @@ public class EMFParser extends AbstractParser { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); try { - HemfExtractor ex = new HemfExtractor(stream); - long lastY = -1; - long lastX = -1; + HemfPicture ex = new HemfPicture(stream); + double lastY = -1; + double lastX = -1; long fudgeFactorX = 1000;//derive this from the font or frame/bounds information StringBuilder buffer = new StringBuilder(); for (HemfRecord record : ex) { - if (record.getRecordType() == HemfRecordType.comment) { - AbstractHemfComment comment = ((HemfCommentRecord) record).getComment(); - if (comment instanceof HemfCommentPublic.MultiFormats) { + if (record.getEmfRecordType() == HemfRecordType.comment) { + HemfComment.EmfCommentData commentData = ((HemfComment.EmfComment) record).getCommentData(); + if (commentData instanceof HemfComment.EmfCommentDataMultiformats) { if (embeddedDocumentExtractor == null) { embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); } - handleMultiFormats((HemfCommentPublic.MultiFormats)comment, xhtml, embeddedDocumentExtractor); - } else if (comment instanceof HemfCommentPublic.WindowsMetafile) { + handleMultiFormats( + (HemfComment.EmfCommentDataMultiformats)commentData, xhtml, embeddedDocumentExtractor); + } else if (commentData instanceof HemfComment.EmfCommentDataWMF) { if (embeddedDocumentExtractor == null) { embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); } - handleWMF((HemfCommentPublic.WindowsMetafile)comment, xhtml, embeddedDocumentExtractor); + handleWMF(((HemfComment.EmfCommentDataWMF) commentData).getWMFData(), + xhtml, embeddedDocumentExtractor); } - } else if (record.getRecordType().equals(HemfRecordType.exttextoutw)) { - HemfText.ExtTextOutW extTextOutW = (HemfText.ExtTextOutW) record; - if (lastY > -1 && lastY != extTextOutW.getY()) { + } else if (record.getEmfRecordType().equals(HemfRecordType.extTextOutW)) { + + HemfText.EmfExtTextOutW extTextOutW = (HemfText.EmfExtTextOutW) record; + //change equality to delta diff; + + if (lastY > -1 && lastY != extTextOutW.getReference().getY()) { xhtml.startElement("p"); xhtml.characters(buffer.toString()); xhtml.endElement("p"); buffer.setLength(0); lastX = -1; } - if (lastX > -1 && extTextOutW.getX() - lastX > fudgeFactorX) { + if (lastX > -1 && extTextOutW.getReference().getX() - lastX > fudgeFactorX) { buffer.append(" "); } String txt = extTextOutW.getText(); buffer.append(txt); - lastY = extTextOutW.getY(); - lastX = extTextOutW.getX(); + lastY = extTextOutW.getReference().getY(); + lastX = extTextOutW.getReference().getX(); } } if (buffer.length() > 0) { @@ -124,12 +127,12 @@ public class EMFParser extends AbstractParser { xhtml.endDocument(); } - private void handleWMF(HemfCommentPublic.WindowsMetafile comment, ContentHandler contentHandler, + private void handleWMF(byte[] bytes, ContentHandler contentHandler, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException, TikaException { Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString()); if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { - try (InputStream is = TikaInputStream.get(comment.getWmfInputStream())) { + try (InputStream is = TikaInputStream.get(bytes)) { embeddedDocumentExtractor.parseEmbedded(is, new EmbeddedContentHandler(contentHandler), embeddedMetadata, false); @@ -139,11 +142,13 @@ public class EMFParser extends AbstractParser { } - private void handleMultiFormats(HemfCommentPublic.MultiFormats comment, ContentHandler handler, + private void handleMultiFormats(HemfComment.EmfCommentDataMultiformats commentData, ContentHandler handler, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, TikaException, SAXException { - for (HemfCommentPublic.HemfMultiFormatsData data : - ((HemfCommentPublic.MultiFormats) comment).getData()) { - handleEmbedded(data.getData(), embeddedDocumentExtractor, handler); + + for (HemfComment.EmfCommentDataFormat dataFormat : + commentData.getFormats()) { + //is this right?! + handleEmbedded(dataFormat.getRawData(), embeddedDocumentExtractor, handler); } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index 756b7fd..95e7ba0 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -137,7 +137,7 @@ public class OfficeParser extends AbstractOfficeParser { //We might consider not bothering to check for macros in root, //if we know we're processing ppt based on content-type identified in metadata - extractMacros(root.getNFileSystem(), xhtml, + extractMacros(root.getFileSystem(), xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java index 5343751..82020a9 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java @@ -69,18 +69,18 @@ public class WMFParser extends AbstractParser { //this is pure hackery for specifying the font //TODO: do what Graphics does by maintaining the stack, etc.! //This fix should be done within POI - if (record.getRecordType().equals(HwmfRecordType.createFontIndirect)) { + if (record.getWmfRecordType().equals(HwmfRecordType.createFontIndirect)) { HwmfFont font = ((HwmfText.WmfCreateFontIndirect) record).getFont(); charset = (font.getCharset() == null || font.getCharset().getCharset() == null) ? LocaleUtil.CHARSET_1252 : font.getCharset().getCharset(); } - if (record.getRecordType().equals(HwmfRecordType.extTextOut)) { + if (record.getWmfRecordType().equals(HwmfRecordType.extTextOut)) { HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut) record; xhtml.startElement("p"); xhtml.characters(textOut.getText(charset)); xhtml.endElement("p"); - } else if (record.getRecordType().equals(HwmfRecordType.textOut)) { + } else if (record.getWmfRecordType().equals(HwmfRecordType.textOut)) { HwmfText.WmfTextOut textOut = (HwmfText.WmfTextOut) record; xhtml.startElement("p"); xhtml.characters(textOut.getText(charset)); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java index 6d1ea8e..202db8e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java @@ -86,7 +86,7 @@ public class WordMLParser extends AbstractXML2003Parser { new WordMLHandler(ch), new HyperlinkHandler(ch, WORD_ML_URL), - new PictHandler(ch, + new PictHandler(ch, metadata, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context))); } @@ -180,6 +180,7 @@ public class WordMLParser extends AbstractXML2003Parser { private class PictHandler extends DefaultHandler { final StringBuilder buffer = new StringBuilder(); + final Metadata parentMetadata; final ContentHandler handler; byte[] rawBytes = null; EmbeddedDocumentExtractor embeddedDocumentExtractor; @@ -189,8 +190,10 @@ public class WordMLParser extends AbstractXML2003Parser { String pictSource = null; final Base64 base64 = new Base64(); - public PictHandler(ContentHandler handler, EmbeddedDocumentExtractor embeddedDocumentExtractor) { + public PictHandler(ContentHandler handler, Metadata metadata, + EmbeddedDocumentExtractor embeddedDocumentExtractor) { this.handler = handler; + this.parentMetadata = metadata; this.embeddedDocumentExtractor = embeddedDocumentExtractor; } @@ -263,11 +266,17 @@ public class WordMLParser extends AbstractXML2003Parser { handleEmbedded(); } else if (BIN_DATA.equals(localName)) { inBin = false; - rawBytes = base64.decode(buffer.toString()); - //reset - buffer.setLength(0); - - if (! inPict) { + boolean success = false; + try { + rawBytes = base64.decode(buffer.toString()); + success = true; + } catch (IllegalArgumentException e) { + EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); + } finally { + //reset + buffer.setLength(0); + } + if (success && ! inPict) { handleEmbedded(); } } diff --git a/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java b/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java index 5fd3dca..367b3af 100644 --- a/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java +++ b/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java @@ -16,7 +16,9 @@ */ package org.apache.tika; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; import org.apache.tika.io.IOUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; @@ -28,7 +30,6 @@ import org.apache.tika.utils.XMLReaderUtils; import org.junit.Ignore; import org.junit.Test; import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; @@ -104,8 +105,9 @@ public class TestXXEInXML extends XMLTestBase { new AutoDetectParser(), new ParseContext()); } + @Test - public void testXMLInZips() throws Exception { + public void testPOIOOXMLs() throws Exception { for (String fileName : new String[]{ "testWORD.docx", "testWORD_1img.docx", @@ -119,15 +121,13 @@ public class TestXXEInXML extends XMLTestBase { "testPPT_2imgs.pptx", "testPPT_comment.pptx", "testPPT_EmbeddedPDF.pptx", - "testPPT_macros.pptm", - "testEPUB.epub" + "testPPT_macros.pptm" }) { - _testOOXML(fileName); + _testPOIOOXMLs(fileName); } } - private void _testOOXML(String fileName) throws Exception { - + private void _testPOIOOXMLs(String fileName) throws Exception { Path originalOOXML = getResourceAsFile("/test-documents/"+fileName).toPath(); Path injected = injectZippedXMLs(originalOOXML, XXE, false); @@ -138,10 +138,13 @@ public class TestXXEInXML extends XMLTestBase { Metadata metadata = new Metadata(); try { p.parse(Files.newInputStream(injected), xhtml, metadata, parseContext); - - } catch (FileNotFoundException e) { - e.printStackTrace(); - fail("problem with: "+fileName + ": "+ e.getMessage()); + } catch (TikaException e) { + Throwable cause = e.getCause(); + if (!(cause instanceof InvalidFormatException)) { + //as of POI 4.1.x + fail("POI should have thrown an IFE complaining about " + + "not being able to read content types part !"); + } } finally { Files.delete(injected); } @@ -166,6 +169,33 @@ public class TestXXEInXML extends XMLTestBase { } @Test + public void testXMLInZips() throws Exception { + for (String fileName : new String[]{ + "testEPUB.epub" + }) { + _testXMLInZips(fileName); + } + } + + private void _testXMLInZips(String fileName) throws Exception { + Path originalOOXML = getResourceAsFile("/test-documents/"+fileName).toPath(); + Path injected = injectZippedXMLs(originalOOXML, XXE, false); + + Parser p = new AutoDetectParser(); + ContentHandler xhtml = new ToHTMLContentHandler(); + ParseContext parseContext = new ParseContext(); + //if the SafeContentHandler is turned off, this will throw an FNFE + Metadata metadata = new Metadata(); + try { + p.parse(Files.newInputStream(injected), xhtml, metadata, parseContext); + } finally { + Files.delete(injected); + } + + } + + + @Test public void testDOM() throws Exception { byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>".getBytes(StandardCharsets.UTF_8); byte[] injected = injectXML(bytes, XXE); @@ -207,9 +237,4 @@ public class TestXXEInXML extends XMLTestBase { TikaConfig tikaConfig = new TikaConfig(new ByteArrayInputStream(injected)); } } - - - - - } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java index fb2d631..8d04697 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java @@ -37,6 +37,11 @@ public class WMFParserTest extends TikaTest { testTextExtraction("testWMF_charset.wmf", 0, "普林斯"); } + @Test + public void testOneOff() throws Exception { + debug(getRecursiveMetadata("testWMF-bad.wmf")); + } + private void testTextExtraction(String fileName, int metaDataItemIndex, String expectedText) throws Exception { List<Metadata> metadataList = getRecursiveMetadata(fileName); Metadata wmfMetadata = metadataList.get(metaDataItemIndex); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java index 4e52792..1ee6d06 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java @@ -47,8 +47,7 @@ public class XML2003ParserTest extends MultiThreadedTikaTest { @Test public void testBasicWord() throws Exception { List<Metadata> list = getRecursiveMetadata("testWORD2003.xml"); - assertEquals(8, list.size()); - + assertEquals(6, list.size()); Metadata m = list.get(0);//container doc String xml = m.get(RecursiveParserWrapper.TIKA_CONTENT); xml = xml.replaceAll("\\s+", " "); @@ -81,7 +80,9 @@ public class XML2003ParserTest extends MultiThreadedTikaTest { //make sure embedded docs were properly processed assertContains("moscow-birds", - Arrays.asList(list.get(7).getValues(TikaCoreProperties.KEYWORDS))); + Arrays.asList(list.get(5).getValues(TikaCoreProperties.KEYWORDS))); + + assertEquals("testJPEG_EXIF.jpg", list.get(5).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)); //check that text is extracted with breaks between elements String txt = getText(getResourceAsStream("/test-documents/testWORD2003.xml"), new AutoDetectParser()); @@ -92,7 +93,6 @@ public class XML2003ParserTest extends MultiThreadedTikaTest { assertContains("footnote Figure", txt); assertContains("test space", txt); - assertEquals("testJPEG_EXIF.jpg", list.get(7).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)); } @Test
