Repository: tika Updated Branches: refs/heads/2.x a38c4271e -> 84bf06285
TIKA-1910 - Make Web and Package optional in Office. Remove POI from package parser Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/84bf0628 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/84bf0628 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/84bf0628 Branch: refs/heads/2.x Commit: 84bf06285451be911a7c711856634a69733a47a1 Parents: a38c427 Author: Bob Paulin <[email protected]> Authored: Sat Mar 26 21:22:15 2016 -0500 Committer: Bob Paulin <[email protected]> Committed: Sat Mar 26 21:22:15 2016 -0500 ---------------------------------------------------------------------- .../module/BundleIT.java | 89 ----------- .../tika-parser-advanced-bundle/pom.xml | 1 - .../tika-parser-office-bundle/pom.xml | 3 + .../tika-parser-package-bundle/pom.xml | 58 +------ .../tika-parser-database-module/pom.xml | 6 + .../tika-parser-office-module/pom.xml | 7 + .../org/apache/tika/parser/chm/ChmParser.java | 11 +- .../microsoft/AbstractPOIFSExtractor.java | 7 +- .../parser/microsoft/JackcessExtractor.java | 8 +- .../tika/parser/microsoft/OutlookExtractor.java | 24 +-- .../microsoft/ooxml/OOXMLExtractorFactory.java | 4 +- .../org/apache/tika/parser/opc/OPCDetector.java | 155 +++++++++++++++++++ .../tika-parser-package-module/pom.xml | 23 +-- .../tika/parser/pkg/ZipContainerDetector.java | 113 ++------------ 14 files changed, 225 insertions(+), 284 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java b/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java deleted file mode 100644 index c446ee8..0000000 --- a/tika-parser-bundles/tika-parser-advanced-bundle/module/BundleIT.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.module; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.assertNotNull; -import static org.ops4j.pax.exam.CoreOptions.bundle; -import static org.ops4j.pax.exam.CoreOptions.junitBundles; -import static org.ops4j.pax.exam.CoreOptions.options; -import static org.ops4j.pax.exam.CoreOptions.mavenBundle; - -import javax.inject.Inject; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.StringWriter; -import java.io.Writer; -import java.net.URISyntaxException; -import java.util.Dictionary; - -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.mime.MediaType; -import org.apache.tika.osgi.TikaService; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.BodyContentHandler; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.ops4j.pax.exam.Configuration; -import org.ops4j.pax.exam.Option; -import org.ops4j.pax.exam.junit.PaxExam; -import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy; -import org.ops4j.pax.exam.spi.reactors.PerMethod; -import org.osgi.framework.Bundle; -import org.osgi.framework.BundleContext; -import org.xml.sax.ContentHandler; - -@RunWith(PaxExam.class) -@ExamReactorStrategy(PerMethod.class) -public class BundleIT { - - private static final String BUNDLE_JAR_SYS_PROP = "project.bundle.file"; - - @Inject - private BundleContext bc; - - @Configuration - public Option[] configuration() throws IOException, URISyntaxException { - String bundleFileName = System.getProperty(BUNDLE_JAR_SYS_PROP); - - return options(junitBundles(), - bundle(new File("target/test-bundles/tika-core.jar").toURI().toURL().toString()), - bundle(new File(bundleFileName).toURI().toString())); - } - - @Test - public void testBundleLoaded() throws Exception { - boolean hasCore = false, hasBundle = false; - for (Bundle b : bc.getBundles()) { - if ("org.apache.tika.core".equals(b.getSymbolicName())) { - hasCore = true; - assertEquals("Core not activated", Bundle.ACTIVE, b.getState()); - } - if ("org.apache.tika.parser-advanced-module".equals(b.getSymbolicName())) { - hasBundle = true; - assertEquals("Bundle not activated", Bundle.ACTIVE, b.getState()); - } - } - assertTrue("Core bundle not found", hasCore); - assertTrue("Advanced bundle not found", hasBundle); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml b/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml index 2339483..28713fa 100644 --- a/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml +++ b/tika-parser-bundles/tika-parser-advanced-bundle/pom.xml @@ -63,7 +63,6 @@ *, opennlp.maxent;resolution:=optional, opennlp.tools.namefind;resolution:=optional, - org.apache.commons.io;resolution:=optional, org.json;resolution:=optional, org.osgi.framework;resolution:=optional, net.didion.jwnl;resolution:=optional http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-office-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-office-bundle/pom.xml b/tika-parser-bundles/tika-parser-office-bundle/pom.xml index cd6ef7f..c9db0da 100644 --- a/tika-parser-bundles/tika-parser-office-bundle/pom.xml +++ b/tika-parser-bundles/tika-parser-office-bundle/pom.xml @@ -82,6 +82,7 @@ org.apache.tika.parser.mbox.*, org.apache.tika.parser.microsoft.*, org.apache.tika.parser.microsoft.ooxml.*, + org.apache.tika.parser.opc.*, org.apache.tika.parser.odf.*, org.apache.tika.parser.opendocument.*, org.apache.tika.parser.rtf.* @@ -122,6 +123,8 @@ org.etsi.uri.x01903.v14;resolution:=optional, org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional, org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional, + org.apache.tika.parser.html.HtmlParser;resolution:=optional, + org.apache.tika.parser.pkg.ZipContainerDetector;resolution:=optional </Import-Package> </instructions> </configuration> http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-bundles/tika-parser-package-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-package-bundle/pom.xml b/tika-parser-bundles/tika-parser-package-bundle/pom.xml index bbd917f..4d292d7 100644 --- a/tika-parser-bundles/tika-parser-package-bundle/pom.xml +++ b/tika-parser-bundles/tika-parser-package-bundle/pom.xml @@ -50,13 +50,8 @@ commons-io;inline=true, commons-codec;inline=true, xz;inline=true, - poi;inline=true, - poi-ooxml;inline=true, - poi-ooxml-schemas;inline=true, - xmlbeans;inline=true, commons-compress;inline=true, - junrar;inline=true, - curvesapi;inline=true + junrar;inline=true </Embed-Dependency> <Embed-Transitive>true</Embed-Transitive> <Export-Package> @@ -64,60 +59,11 @@ org.apache.tika.parser.iwork.* </Export-Package> <Import-Package> - !org.junit, - !org.junit.*, - !junit.*, *, - com.microsoft.schemas.office.powerpoint;resolution:=optional, - com.microsoft.schemas.office.word;resolution:=optional, org.apache.commons.vfs2;resolution:=optional, org.apache.commons.vfs2.provider;resolution:=optional, org.apache.commons.vfs2.util;resolution:=optional, - org.apache.crimson.jaxp;resolution:=optional, - org.apache.jcp.xml.dsig.internal.dom;resolution:=optional, - org.apache.xml.resolver;resolution:=optional, - org.apache.xml.resolver.tools;resolution:=optional, - org.apache.xml.security;resolution:=optional, - org.apache.xml.security.c14n;resolution:=optional, - org.apache.xml.security.utils;resolution:=optional, - org.apache.xmlbeans.impl.xpath.saxon;resolution:=optional, - org.apache.xmlbeans.impl.xquery.saxon;resolution:=optional, - com.sun.javadoc;resolution:=optional, - com.sun.xml.bind.marshaller;resolution:=optional, - com.sun.xml.internal.bind.marshaller;resolution:=optional, - com.sun.msv.datatype;resolution:=optional, - com.sun.msv.datatype.xsd;resolution:=optional, - com.sun.tools.javadoc;resolution:=optional, - org.apache.poi.hdgf.extractor;resolution:=optional, - org.apache.poi.hpbf.extractor;resolution:=optional, - org.apache.poi.hslf.blip;resolution:=optional, - org.apache.poi.hslf.extractor;resolution:=optional, - org.apache.poi.hsmf;resolution:=optional, - org.apache.poi.hsmf.datatypes;resolution:=optional, - org.apache.poi.hsmf.extractor;resolution:=optional, - org.apache.poi.hwpf;resolution:=optional, - org.apache.poi.hwpf.extractor;resolution:=optional, - org.apache.tools.ant;resolution:=optional, - org.apache.tools.ant.taskdefs;resolution:=optional, - org.apache.tools.ant.types;resolution:=optional, - org.bouncycastle.asn1;resolution:=optional, - org.bouncycastle.asn1.cmp;resolution:=optional, - org.bouncycastle.asn1.nist;resolution:=optional, - org.bouncycastle.asn1.ocsp;resolution:=optional, - org.bouncycastle.asn1.x500;resolution:=optional, - org.bouncycastle.asn1.x509;resolution:=optional, - org.bouncycastle.cert;resolution:=optional, - org.bouncycastle.cert.jcajce;resolution:=optional, - org.bouncycastle.cert.ocsp;resolution:=optional, - org.bouncycastle.cms;resolution:=optional, - org.bouncycastle.cms.bc;resolution:=optional, - org.bouncycastle.operator;resolution:=optional, - org.bouncycastle.operator.bc;resolution:=optional, - org.bouncycastle.tsp;resolution:=optional, - org.bouncycastle.util;resolution:=optional, - org.etsi.uri.x01903.v14;resolution:=optional, - org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional, - org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional, + </Import-Package> </instructions> </configuration> http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-database-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-database-module/pom.xml b/tika-parser-modules/tika-parser-database-module/pom.xml index fd47f3d..a60dae3 100644 --- a/tika-parser-modules/tika-parser-database-module/pom.xml +++ b/tika-parser-modules/tika-parser-database-module/pom.xml @@ -47,6 +47,12 @@ <version>${project.version}</version> <scope>test</scope> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-parser-package-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> </dependencies> <build> http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/pom.xml b/tika-parser-modules/tika-parser-office-module/pom.xml index 3a8e5d2..689c133 100644 --- a/tika-parser-modules/tika-parser-office-module/pom.xml +++ b/tika-parser-modules/tika-parser-office-module/pom.xml @@ -30,6 +30,11 @@ <version>${project.version}</version> </dependency> <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>${commons.io.version}</version> + </dependency> + <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>${poi.version}</version> @@ -73,11 +78,13 @@ <groupId>${project.groupId}</groupId> <artifactId>tika-parser-package-module</artifactId> <version>${project.version}</version> + <scope>test</scope> </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-parser-web-module</artifactId> <version>${project.version}</version> + <scope>test</scope> </dependency> <dependency> <groupId>${project.groupId}</groupId> http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java index 7c43995..c3e85c1 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/ChmParser.java @@ -29,9 +29,10 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserProxy; import org.apache.tika.parser.chm.accessor.DirectoryListingEntry; import org.apache.tika.parser.chm.core.ChmExtractor; -import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; @@ -49,6 +50,11 @@ public class ChmParser extends AbstractParser { MediaType.application("chm"), MediaType.application("x-chm")))); + private final Parser htmlProxy; + + public ChmParser() { + this.htmlProxy = createParserProxy("org.apache.tika.parser.html.HtmlParser"); + } @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; @@ -91,12 +97,11 @@ public class ChmParser extends AbstractParser { private void parsePage(byte[] byteObject, ContentHandler xhtml) throws TikaException {// throws IOException InputStream stream = null; Metadata metadata = new Metadata(); - HtmlParser htmlParser = new HtmlParser(); ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1 ParseContext parser = new ParseContext(); try { stream = new ByteArrayInputStream(byteObject); - htmlParser.parse(stream, handler, metadata, parser); + htmlProxy.parse(stream, handler, metadata, parser); } catch (SAXException e) { throw new RuntimeException(e); } catch (IOException e) { http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java index e2acb52..320cc74 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java @@ -31,6 +31,7 @@ import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.hpsf.ClassID; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; +import org.apache.tika.detect.DetectorProxy; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; @@ -43,7 +44,6 @@ import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType; -import org.apache.tika.parser.pkg.ZipContainerDetector; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; @@ -55,6 +55,7 @@ abstract class AbstractPOIFSExtractor { private MimeTypes mimeTypes; private Detector detector; private Metadata metadata; + private final Detector zipDetectorProxy; protected AbstractPOIFSExtractor(ParseContext context) { this(context, null); @@ -74,6 +75,7 @@ abstract class AbstractPOIFSExtractor { this.mimeTypes = context.get(MimeTypes.class); this.detector = context.get(Detector.class); this.metadata = metadata; + this.zipDetectorProxy = new DetectorProxy("org.apache.tika.parser.pkg.ZipContainerDetector", getClass().getClassLoader()); } // Note - these cache, but avoid creating the default TikaConfig if not needed @@ -159,8 +161,7 @@ abstract class AbstractPOIFSExtractor { try (TikaInputStream stream = TikaInputStream.get( new DocumentInputStream((DocumentEntry) ooxml))) { - ZipContainerDetector detector = new ZipContainerDetector(); - MediaType type = detector.detect(stream, new Metadata()); + MediaType type = zipDetectorProxy.detect(stream, new Metadata()); handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true); return; } http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java index e224d54..345dd24 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java @@ -47,7 +47,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.html.HtmlParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserProxy; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; @@ -71,10 +72,11 @@ class JackcessExtractor extends AbstractPOIFSExtractor { final NumberFormat currencyFormatter; final DateFormat shortDateTimeFormatter; - final HtmlParser htmlParser = new HtmlParser(); + private final Parser htmlParserProxy; protected JackcessExtractor(ParseContext context, Locale locale) { super(context); + this.htmlParserProxy = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader()); currencyFormatter = NumberFormat.getCurrencyInstance(locale); shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale); } @@ -200,7 +202,7 @@ class JackcessExtractor extends AbstractPOIFSExtractor { Metadata m = new Metadata(); m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); try { - htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)), + htmlParserProxy.parse(new ByteArrayInputStream(v.getBytes(UTF_8)), h, m, EMPTY_PARSE_CONTEXT); handler.characters(h.toString()); http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 3a85882..108d5eb 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -23,6 +23,7 @@ import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.Date; import java.util.List; import java.util.Locale; @@ -44,14 +45,15 @@ import org.apache.poi.hsmf.exceptions.ChunkNotFoundException; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.util.CodePageUtil; +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.detect.EncodingDetectorProxy; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.html.HtmlEncodingDetector; -import org.apache.tika.parser.html.HtmlParser; -import org.apache.tika.parser.mbox.MboxParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserProxy; import org.apache.tika.parser.rtf.RTFParser; import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetMatch; @@ -67,7 +69,9 @@ import static java.nio.charset.StandardCharsets.UTF_8; */ public class OutlookExtractor extends AbstractPOIFSExtractor { private static final Metadata EMPTY_METADATA = new Metadata(); - HtmlEncodingDetector detector = new HtmlEncodingDetector(); + private final SimpleDateFormat dateFormat; + private final EncodingDetector htmlEncodingDetectorProxy; + private final Parser htmlParserProxy; private final MAPIMessage msg; @@ -77,7 +81,9 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException { super(context); - + this.htmlEncodingDetectorProxy = new EncodingDetectorProxy("org.apache.tika.parser.html.HtmlEncodingDetector", getClass().getClassLoader()); + this.htmlParserProxy = new ParserProxy("org.apache.tika.parser.html.HtmlParser", getClass().getClassLoader()); + this.dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US); try { this.msg = new MAPIMessage(root); } catch (IOException e) { @@ -135,7 +141,8 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { // See if we can parse it as a normal mail date try { - Date d = MboxParser.parseDate(date); + + Date d = dateFormat.parse(date); metadata.set(TikaCoreProperties.CREATED, d); metadata.set(TikaCoreProperties.MODIFIED, d); } catch (ParseException e) { @@ -196,8 +203,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { data = ((StringChunk) htmlChunk).getRawValue(); } if (data != null) { - HtmlParser htmlParser = new HtmlParser(); - htmlParser.parse( + htmlParserProxy.parse( new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), new ParseContext() @@ -341,7 +347,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { if(html != null && html.length() > 0) { Charset charset = null; try { - charset = detector.detect(new ByteArrayInputStream( + charset = htmlEncodingDetectorProxy.detect(new ByteArrayInputStream( html.getBytes(UTF_8)), EMPTY_METADATA); } catch (IOException e) { //swallow http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index add1f2c..518a000 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -39,7 +39,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.pkg.ZipContainerDetector; +import org.apache.tika.parser.opc.OPCDetector; import org.apache.xmlbeans.XmlException; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -74,7 +74,7 @@ public class OOXMLExtractorFactory { } // Get the type, and ensure it's one we handle - MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg); + MediaType type = OPCDetector.detectOfficeOpenXML(pkg); if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) { // Not a supported type, delegate to Empty Parser EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context); http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java new file mode 100644 index 0000000..cc17459 --- /dev/null +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/opc/OPCDetector.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.opc; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Locale; +import java.util.regex.Pattern; + +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackageAccess; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; +import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; +import org.apache.tika.detect.Detector; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; + +/** + * Detector that detects OPC Packages + * + */ +public class OPCDetector implements Detector { + + /** + * + */ + private static final long serialVersionUID = -3569622763024617244L; + + private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE); + + // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes + private static final String VISIO_DOCUMENT = + "http://schemas.microsoft.com/visio/2010/relationships/document"; + + // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes + private static final String STRICT_CORE_DOCUMENT = + "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument"; + + @Override + public MediaType detect(InputStream input, Metadata metadata) throws IOException { + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream stream = TikaInputStream.get(input, tmp); + // Use POI to open and investigate it for us + OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ); + stream.setOpenContainer(pkg); + + // Is at an OOXML format? + MediaType type = detectOfficeOpenXML(pkg); + if (type != null) return type; + + // Is it XPS format? + type = detectXPSOPC(pkg); + if (type != null) return type; + + // Is it an AutoCAD format? + type = detectAutoCADOPC(pkg); + + return type; + } catch (InvalidFormatException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + }finally { + tmp.close(); + } + return null; + } + + /** + * Detects the type of an OfficeOpenXML (OOXML) file from + * opened Package + */ + public static MediaType detectOfficeOpenXML(OPCPackage pkg) { + // Check for the normal Office core document + PackageRelationshipCollection core = + pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT); + // Otherwise check for some other Office core document types + if (core.size() == 0) { + core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT); + } + if (core.size() == 0) { + core = pkg.getRelationshipsByType(VISIO_DOCUMENT); + } + + // If we didn't find a single core document of any type, skip detection + if (core.size() != 1) { + // Invalid OOXML Package received + return null; + } + + // Get the type of the core document part + PackagePart corePart = pkg.getPart(core.getRelationship(0)); + String coreType = corePart.getContentType(); + + // Turn that into the type of the overall document + String docType = coreType.substring(0, coreType.lastIndexOf('.')); + + // The Macro Enabled formats are a little special + if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) { + docType = docType.toLowerCase(Locale.ROOT) + ".12"; + } + + if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) { + docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12"); + } + + // Build the MediaType object and return + return MediaType.parse(docType); + } + /** + * Detects Open XML Paper Specification (XPS) + */ + private static MediaType detectXPSOPC(OPCPackage pkg) { + PackageRelationshipCollection xps = + pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation"); + if (xps.size() == 1) { + return MediaType.application("vnd.ms-xpsdocument"); + } else { + // Non-XPS Package received + return null; + } + } + /** + * Detects AutoCAD formats that live in OPC packaging + */ + private static MediaType detectAutoCADOPC(OPCPackage pkg) { + PackageRelationshipCollection dwfxSeq = + pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence"); + if (dwfxSeq.size() == 1) { + return MediaType.parse("model/vnd.dwfx+xps"); + } else { + // Non-AutoCAD Package received + return null; + } + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-package-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-package-module/pom.xml b/tika-parser-modules/tika-parser-package-module/pom.xml index 2e226d2..8d1238d 100644 --- a/tika-parser-modules/tika-parser-package-module/pom.xml +++ b/tika-parser-modules/tika-parser-package-module/pom.xml @@ -35,21 +35,6 @@ <version>${project.version}</version> </dependency> <dependency> - <groupId>org.apache.poi</groupId> - <artifactId>poi-ooxml</artifactId> - <version>${poi.version}</version> - <exclusions> - <exclusion> - <groupId>stax</groupId> - <artifactId>stax-api</artifactId> - </exclusion> - <exclusion> - <groupId>xml-apis</groupId> - <artifactId>xml-apis</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> <groupId>org.tukaani</groupId> <artifactId>xz</artifactId> <version>${tukaani.version}</version> @@ -69,13 +54,17 @@ <artifactId>commons-compress</artifactId> <version>${commons.compress.version}</version> </dependency> - <dependency> + <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-parser-text-module</artifactId> <version>${project.version}</version> <scope>test</scope> </dependency> - + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + <version>${codec.version}</version> + </dependency> </dependencies> <build> http://git-wip-us.apache.org/repos/asf/tika/blob/84bf0628/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java index 9ca6729..8276e9a 100644 --- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java +++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java @@ -22,7 +22,6 @@ import java.io.InputStream; import java.util.Enumeration; import java.util.HashSet; import java.util.Iterator; -import java.util.Locale; import java.util.Set; import java.util.regex.Pattern; @@ -36,12 +35,7 @@ import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.compressors.CompressorInputStream; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.commons.io.IOUtils; -import org.apache.poi.openxml4j.exceptions.InvalidFormatException; -import org.apache.poi.openxml4j.opc.OPCPackage; -import org.apache.poi.openxml4j.opc.PackageAccess; -import org.apache.poi.openxml4j.opc.PackagePart; -import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; -import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; +import org.apache.tika.detect.AbstractDetector; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; @@ -57,18 +51,16 @@ import static java.nio.charset.StandardCharsets.UTF_8; * A detector that works on Zip documents and other archive and compression * formats to figure out exactly what the file is. */ -public class ZipContainerDetector implements Detector { - private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE); +public class ZipContainerDetector extends AbstractDetector { - // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes - private static final String VISIO_DOCUMENT = - "http://schemas.microsoft.com/visio/2010/relationships/document"; - // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes - private static final String STRICT_CORE_DOCUMENT = - "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument"; - /** Serial version UID */ private static final long serialVersionUID = 2891763938430295453L; + + private final Detector opcDetector; + + public ZipContainerDetector() { + this.opcDetector = createDetectorProxy("org.apache.tika.parser.opc.OPCDetector"); + } public MediaType detect(InputStream input, Metadata metadata) throws IOException { @@ -138,7 +130,7 @@ public class ZipContainerDetector implements Detector { } } - private static MediaType detectZipFormat(TikaInputStream tis) { + private MediaType detectZipFormat(TikaInputStream tis) { try { ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()? try { @@ -199,24 +191,11 @@ public class ZipContainerDetector implements Detector { } } - private static MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) { + private MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) { try { if (zip.getEntry("_rels/.rels") != null || zip.getEntry("[Content_Types].xml") != null) { - // Use POI to open and investigate it for us - OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ); - stream.setOpenContainer(pkg); - - // Is at an OOXML format? - MediaType type = detectOfficeOpenXML(pkg); - if (type != null) return type; - - // Is it XPS format? - type = detectXPSOPC(pkg); - if (type != null) return type; - - // Is it an AutoCAD format? - type = detectAutoCADOPC(pkg); + MediaType type = this.opcDetector.detect(stream, null); if (type != null) return type; // We don't know what it is, sorry @@ -228,77 +207,9 @@ public class ZipContainerDetector implements Detector { return null; } catch (RuntimeException e) { return null; - } catch (InvalidFormatException e) { - return null; - } - } - /** - * Detects the type of an OfficeOpenXML (OOXML) file from - * opened Package - */ - public static MediaType detectOfficeOpenXML(OPCPackage pkg) { - // Check for the normal Office core document - PackageRelationshipCollection core = - pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT); - // Otherwise check for some other Office core document types - if (core.size() == 0) { - core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT); - } - if (core.size() == 0) { - core = pkg.getRelationshipsByType(VISIO_DOCUMENT); - } - - // If we didn't find a single core document of any type, skip detection - if (core.size() != 1) { - // Invalid OOXML Package received - return null; - } - - // Get the type of the core document part - PackagePart corePart = pkg.getPart(core.getRelationship(0)); - String coreType = corePart.getContentType(); - - // Turn that into the type of the overall document - String docType = coreType.substring(0, coreType.lastIndexOf('.')); - - // The Macro Enabled formats are a little special - if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) { - docType = docType.toLowerCase(Locale.ROOT) + ".12"; - } - - if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) { - docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12"); - } - - // Build the MediaType object and return - return MediaType.parse(docType); - } - /** - * Detects Open XML Paper Specification (XPS) - */ - private static MediaType detectXPSOPC(OPCPackage pkg) { - PackageRelationshipCollection xps = - pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation"); - if (xps.size() == 1) { - return MediaType.application("vnd.ms-xpsdocument"); - } else { - // Non-XPS Package received - return null; - } - } - /** - * Detects AutoCAD formats that live in OPC packaging - */ - private static MediaType detectAutoCADOPC(OPCPackage pkg) { - PackageRelationshipCollection dwfxSeq = - pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence"); - if (dwfxSeq.size() == 1) { - return MediaType.parse("model/vnd.dwfx+xps"); - } else { - // Non-AutoCAD Package received - return null; } } + private static MediaType detectIWork(ZipFile zip) { if (zip.getEntry(IWorkPackageParser.IWORK_COMMON_ENTRY) != null) {
