This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-3164-1.x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7aa27326fcc285c52876a0ae3759d5fe93f62789 Author: tallison <talli...@apache.org> AuthorDate: Tue Feb 23 05:33:26 2021 -0500 TIKA-3164 -- WIP POI 5.0.0 --- tika-parent/pom.xml | 2 +- tika-parsers/pom.xml | 8 ++++++ .../tika/parser/microsoft/OutlookExtractor.java | 4 +-- .../microsoft/ooxml/OOXMLExtractorFactory.java | 30 ++++++++++++++-------- .../ooxml/SXSLFPowerPointExtractorDecorator.java | 4 +-- .../ooxml/XSLFPowerPointExtractorDecorator.java | 18 +++---------- .../microsoft/ooxml/xps/XPSTextExtractor.java | 25 ++++++++++++++++-- .../xslf/XSLFEventBasedPowerPointExtractor.java | 23 +++++++++++++++-- .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 24 +++++++++++++++-- 9 files changed, 101 insertions(+), 37 deletions(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 626b1a2..baccf43 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -334,7 +334,7 @@ <maven.shade.version>3.2.4</maven.shade.version> <rat.version>0.13</rat.version> <!-- NOTE: sync tukaani version with commons-compress in tika-parsers --> - <poi.version>4.1.2</poi.version> + <poi.version>5.0.0</poi.version> <commons.compress.version>1.20</commons.compress.version> <commons.io.version>2.8.0</commons.io.version> <commons.lang3.version>3.11</commons.lang3.version> diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index fa5e6f0..53e0a26 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -264,6 +264,14 @@ <groupId>org.apache.commons</groupId> <artifactId>commons-compress</artifactId> </exclusion> + <exclusion> + <groupId>com.fasterxml.woodstox</groupId> + <artifactId>woodstox-core</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.xmlgraphics</groupId> + <artifactId>batik-all</artifactId> + </exclusion> </exclusions> </dependency> <dependency> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index c2e27d6..b8c3d66 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -700,8 +700,8 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { for (RecipientChunks chunks : recipientChunks) { Recipient r = new Recipient(); - r.displayName = (chunks.recipientDisplayNameChunk != null) ? chunks.recipientDisplayNameChunk.toString() : null; - r.name = (chunks.recipientNameChunk != null) ? chunks.recipientNameChunk.toString() : null; + r.displayName = (chunks.getRecipientDisplayNameChunk() != null) ? chunks.getRecipientDisplayNameChunk().toString() : null; + r.name = (chunks.getRecipientNameChunk() != null) ? chunks.getRecipientNameChunk().toString() : null; r.emailAddress = chunks.getRecipientEmailAddress(); List<PropertyValue> vals = chunks.getProperties().get(MAPIProperty.RECIPIENT_TYPE); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 15f2c33..e2dc17e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -22,9 +22,8 @@ import java.io.IOException; import java.io.InputStream; import java.util.Locale; -import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.poi.ooxml.POIXMLDocument; -import org.apache.poi.ooxml.extractor.ExtractorFactory; +import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory; import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.InvalidOperationException; @@ -34,10 +33,9 @@ import org.apache.poi.openxml4j.opc.PackageAccess; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.util.LocaleUtil; -import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xslf.extractor.XSLFExtractor; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFRelation; -import org.apache.poi.xslf.usermodel.XSLFSlideShow; import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; @@ -64,6 +62,8 @@ import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import static org.apache.poi.ooxml.extractor.POIXMLExtractorFactory.setThreadPrefersEventExtractors; + /** * Figures out the correct {@link OOXMLExtractor} for the supplied document and * returns it. @@ -72,13 +72,17 @@ public class OOXMLExtractorFactory { private static final Logger LOG = LoggerFactory.getLogger(OOXMLExtractorFactory.class); private static final int MAX_BUFFER_LENGTH = 1000000; + private static POIXMLExtractorFactory EXTRACTOR_FACTORY = new POIXMLExtractorFactory(); + + static { + setThreadPrefersEventExtractors(true); + } public static void parse( InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale()); - ExtractorFactory.setThreadPrefersEventExtractors(true); //if there's a problem opening the zip file; //create a tmp file, and copy what you can read of it. @@ -167,7 +171,7 @@ public class OOXMLExtractorFactory { } if (poiExtractor == null) { - poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(pkg); + poiExtractor = EXTRACTOR_FACTORY.create(pkg); } POIXMLDocument document = poiExtractor.getDocument(); @@ -192,8 +196,8 @@ public class OOXMLExtractorFactory { "The extractor returned was a " + poiExtractor ); } else if (document instanceof XMLSlideShow) { - extractor = new XSLFPowerPointExtractorDecorator( - context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor); + extractor = new XSLFPowerPointExtractorDecorator( metadata, + context, (org.apache.poi.xslf.extractor.XSLFExtractor) poiExtractor); } else if (document instanceof XWPFDocument) { extractor = new XWPFWordExtractorDecorator( metadata, context, (XWPFWordExtractor) poiExtractor); @@ -279,7 +283,11 @@ public class OOXMLExtractorFactory { } String targetContentType = corePart.getContentType(); - XSLFRelation[] xslfRelations = org.apache.poi.xslf.extractor.XSLFPowerPointExtractor.SUPPORTED_TYPES; + //TODO make this static...or find what happened to SUPPORTED_TYPES + XSLFRelation[] xslfRelations = new XSLFRelation[] { + XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE, + XSLFRelation.PRESENTATIONML_TEMPLATE + }; for (int i = 0; i < xslfRelations.length; i++) { XSLFRelation xslfRelation = xslfRelations[i]; @@ -287,7 +295,7 @@ public class OOXMLExtractorFactory { if (eventBased) { return new XSLFEventBasedPowerPointExtractor(pkg); } else { - return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); + return new XSLFExtractor(new XMLSlideShow(pkg)); } } } @@ -296,7 +304,7 @@ public class OOXMLExtractorFactory { if (eventBased) { return new XSLFEventBasedPowerPointExtractor(pkg); } else { - return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); + return new XSLFExtractor(new XMLSlideShow(pkg)); } } return null; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java index ac6e278..5350f30 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java @@ -33,7 +33,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.openxml4j.opc.PackagingURIHelper; import org.apache.poi.openxml4j.opc.TargetMode; -import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xslf.extractor.XSLFExtractor; import org.apache.poi.xslf.usermodel.XSLFRelation; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -95,7 +95,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { } /** - * @see XSLFPowerPointExtractor#getText() + * @see XSLFExtractor#getText() */ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java index c63fcb3..9b61d68 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java @@ -30,13 +30,11 @@ import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.openxml4j.opc.PackagingURIHelper; import org.apache.poi.openxml4j.opc.TargetMode; -import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.sl.usermodel.Placeholder; -import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xslf.extractor.XSLFExtractor; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFComment; import org.apache.poi.xslf.usermodel.XSLFCommentAuthors; -import org.apache.poi.xslf.usermodel.XSLFComments; import org.apache.poi.xslf.usermodel.XSLFGraphicFrame; import org.apache.poi.xslf.usermodel.XSLFGroupShape; import org.apache.poi.xslf.usermodel.XSLFHyperlink; @@ -73,23 +71,13 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { private Metadata metadata; - public XSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context, XSLFPowerPointExtractor extractor) { + public XSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context, XSLFExtractor extractor) { super(context, extractor); this.metadata = metadata; } /** - * use {@link XSLFPowerPointExtractorDecorator#XSLFPowerPointExtractorDecorator(Metadata, ParseContext, XSLFPowerPointExtractor)} - * @param context - * @param extractor - */ - @Deprecated - public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) { - this(new Metadata(),context, extractor); - } - - /** - * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText() + * @see org.apache.poi.xslf.extractor.XSLFExtractor#getText() */ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException { XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument(); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java index 0212920..a590d39 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java @@ -25,6 +25,7 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.xmlbeans.XmlException; +import java.io.Closeable; import java.io.IOException; /** @@ -32,13 +33,12 @@ import java.io.IOException; * and keep the general framework similar to our other POI-integrated * extractors. */ -public class XPSTextExtractor extends POIXMLTextExtractor { +public class XPSTextExtractor implements POIXMLTextExtractor { private final OPCPackage pkg; private final POIXMLProperties properties; public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException { - super((POIXMLDocument)null); this.pkg = pkg; this.properties = new POIXMLProperties(pkg); @@ -53,6 +53,22 @@ public class XPSTextExtractor extends POIXMLTextExtractor { public String getText() { return null; } + + @Override + public void setCloseFilesystem(boolean b) { + + } + + @Override + public boolean isCloseFilesystem() { + return false; + } + + @Override + public Closeable getFilesystem() { + return null; + } + public POIXMLProperties.CoreProperties getCoreProperties() { return this.properties.getCoreProperties(); } @@ -64,4 +80,9 @@ public class XPSTextExtractor extends POIXMLTextExtractor { public POIXMLProperties.CustomProperties getCustomProperties() { return this.properties.getCustomProperties(); } + + @Override + public POIXMLDocument getDocument() { + return null; + } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java index bd5615d..4f666a7 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java @@ -17,6 +17,7 @@ package org.apache.tika.parser.microsoft.ooxml.xslf; +import java.io.Closeable; import java.io.IOException; import java.util.Date; @@ -31,7 +32,7 @@ import org.apache.tika.parser.microsoft.ooxml.RunProperties; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor; import org.apache.xmlbeans.XmlException; -public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor { +public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor { private OPCPackage container; @@ -42,7 +43,6 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor { } public XSLFEventBasedPowerPointExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { - super((POIXMLDocument) null); this.container = container; this.properties = new POIXMLProperties(container); } @@ -76,6 +76,11 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor { return this.properties.getCustomProperties(); } + @Override + public POIXMLDocument getDocument() { + return null; + } + @Override public String getText() { @@ -83,6 +88,20 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor { return ""; } + @Override + public void setCloseFilesystem(boolean b) { + + } + + @Override + public boolean isCloseFilesystem() { + return false; + } + + @Override + public Closeable getFilesystem() { + return null; + } private class XSLFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java index 866bb78..d2ef6db 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java @@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft.ooxml.xwpf; import javax.xml.parsers.ParserConfigurationException; +import java.io.Closeable; import java.io.IOException; import java.io.InputStream; import java.util.Date; @@ -54,7 +55,7 @@ import org.xml.sax.XMLReader; * Experimental class that is based on POI's XSSFEventBasedExcelExtractor * */ -public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor { +public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { private static final Logger LOG = LoggerFactory.getLogger(XWPFEventBasedWordExtractor.class); @@ -66,7 +67,6 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor { } public XWPFEventBasedWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { - super((POIXMLDocument) null); this.container = container; this.properties = new POIXMLProperties(container); } @@ -100,6 +100,11 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor { return this.properties.getCustomProperties(); } + @Override + public POIXMLDocument getDocument() { + return null; + } + @Override public String getText() { @@ -139,6 +144,21 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor { return sb.toString(); } + @Override + public void setCloseFilesystem(boolean b) { + + } + + @Override + public boolean isCloseFilesystem() { + return false; + } + + @Override + public Closeable getFilesystem() { + return null; + } + private void handleDocumentPart(PackagePart documentPart, StringBuilder sb) throws IOException, SAXException { //load the numbering/list manager and styles from the main document part