TIKA-2059 - Merge multimedia and pdf parser modules and bundles Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/59e0ca0f Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/59e0ca0f Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/59e0ca0f
Branch: refs/heads/2.x Commit: 59e0ca0fcb311b2207295f17dcf37ac293d28583 Parents: 87b6d5d Author: Bob Paulin <[email protected]> Authored: Sun Aug 28 11:29:22 2016 -0500 Committer: Bob Paulin <[email protected]> Committed: Sun Aug 28 11:29:22 2016 -0500 ---------------------------------------------------------------------- tika-parser-bundles/pom.xml | 1 - .../tika-parser-journal-bundle/pom.xml | 2 +- .../apache/tika/module/journal/BundleIT.java | 4 +- .../tika-parser-journal-bundle/test-bundles.xml | 2 +- .../tika-parser-multimedia-bundle/pom.xml | 20 +- .../apache/tika/module/multimedia/BundleIT.java | 2 +- .../tika-parser-pdf-bundle/pom.xml | 109 -- .../org/apache/tika/module/pdf/BundleIT.java | 96 -- .../tika-parser-pdf-bundle/test-bundles.xml | 34 - tika-parser-modules/pom.xml | 1 - .../tika-parser-journal-module/pom.xml | 2 +- .../tika-parser-multimedia-module/pom.xml | 59 + .../tika/parser/pdf/AbstractPDF2XHTML.java | 579 ++++++++ .../apache/tika/parser/pdf/AccessChecker.java | 81 ++ .../org/apache/tika/parser/pdf/OCR2XHTML.java | 125 ++ .../org/apache/tika/parser/pdf/PDF2XHTML.java | 339 +++++ .../parser/pdf/PDFEncodedStringDecoder.java | 119 ++ .../org/apache/tika/parser/pdf/PDFParser.java | 626 +++++++++ .../apache/tika/parser/pdf/PDFParserConfig.java | 614 +++++++++ .../apache/tika/parser/pdf/XFAExtractor.java | 304 +++++ .../services/org.apache.tika.parser.Parser | 3 +- .../apache/tika/parser/pdf/PDFParser.properties | 34 + .../tika/parser/pdf/AccessCheckerTest.java | 137 ++ .../apache/tika/parser/pdf/PDFParserTest.java | 1240 ++++++++++++++++++ .../tika-parser-pdf-module/pom.xml | 126 -- .../tika/module/pdf/internal/Activator.java | 36 - .../tika/parser/pdf/AbstractPDF2XHTML.java | 579 -------- .../apache/tika/parser/pdf/AccessChecker.java | 81 -- .../org/apache/tika/parser/pdf/OCR2XHTML.java | 125 -- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 339 ----- .../parser/pdf/PDFEncodedStringDecoder.java | 119 -- 
.../org/apache/tika/parser/pdf/PDFParser.java | 626 --------- .../apache/tika/parser/pdf/PDFParserConfig.java | 614 --------- .../apache/tika/parser/pdf/XFAExtractor.java | 304 ----- .../services/org.apache.tika.parser.Parser | 17 - .../apache/tika/parser/pdf/PDFParser.properties | 34 - .../tika/parser/pdf/AccessCheckerTest.java | 137 -- .../apache/tika/parser/pdf/PDFParserTest.java | 1240 ------------------ tika-parsers/pom.xml | 6 - 39 files changed, 4283 insertions(+), 4633 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/pom.xml b/tika-parser-bundles/pom.xml index bcaf4d1..37ab26d 100644 --- a/tika-parser-bundles/pom.xml +++ b/tika-parser-bundles/pom.xml @@ -52,7 +52,6 @@ <module>tika-parser-multimedia-bundle</module> <module>tika-parser-office-bundle</module> <module>tika-parser-package-bundle</module> - <module>tika-parser-pdf-bundle</module> <module>tika-parser-scientific-bundle</module> <module>tika-parser-text-bundle</module> <module>tika-parser-web-bundle</module> http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-journal-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-journal-bundle/pom.xml b/tika-parser-bundles/tika-parser-journal-bundle/pom.xml index b918a7a..02113be 100644 --- a/tika-parser-bundles/tika-parser-journal-bundle/pom.xml +++ b/tika-parser-bundles/tika-parser-journal-bundle/pom.xml @@ -32,7 +32,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-parser-pdf-bundle</artifactId> + <artifactId>tika-parser-multimedia-bundle</artifactId> <version>${project.version}</version> </dependency> </dependencies> 
http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java b/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java index c8e8448..2d72e17 100644 --- a/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java +++ b/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java @@ -68,7 +68,7 @@ public class BundleIT { String bundleFileName = System.getProperty(BUNDLE_JAR_SYS_PROP); return options(junitBundles(), bundle(new File("target/test-bundles/tika-core.jar").toURI().toURL().toString()), - bundle(new File("target/test-bundles/tika-parser-pdf-bundle.jar").toURI().toURL().toString()), + bundle(new File("target/test-bundles/tika-parser-multimedia-bundle.jar").toURI().toURL().toString()), bundle(new File(bundleFileName).toURI().toString())); } @@ -92,6 +92,6 @@ public class BundleIT { @Test public void testServicesCreated() throws Exception { ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null); - assertEquals("Not all Services have started", 16, services.length); + assertEquals("Not all Services have started", 17, services.length); } } http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-journal-bundle/test-bundles.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-journal-bundle/test-bundles.xml b/tika-parser-bundles/tika-parser-journal-bundle/test-bundles.xml index 8cd36d6..faea0bf 100644 --- a/tika-parser-bundles/tika-parser-journal-bundle/test-bundles.xml +++ 
b/tika-parser-bundles/tika-parser-journal-bundle/test-bundles.xml @@ -28,7 +28,7 @@ <outputFileNameMapping>${artifact.artifactId}.jar</outputFileNameMapping> <includes> <include>org.apache.tika:tika-core</include> - <include>org.apache.tika:tika-parser-pdf-bundle</include> + <include>org.apache.tika:tika-parser-multimedia-bundle</include> </includes> </dependencySet> </dependencySets> http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml index ab1d1b4..8a45cf7 100644 --- a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml +++ b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml @@ -54,6 +54,12 @@ jempbox;inline=true, fontbox;inline=true, isoparser;inline=true, + pdfbox;inline=true, + pdfbox-tools;inline=true, + pdfbox-debugger;inline=true, + bcmail-jdk15on, + bcprov-jdk15on, + bcpkix-jdk15on </Embed-Dependency> <Embed-Transitive>true</Embed-Transitive> <Export-Package> @@ -62,13 +68,23 @@ org.apache.tika.parser.audio.*, org.apache.tika.parser.video.*, org.apache.tika.parser.mp3.*, - org.apache.tika.parser.mp4.* + org.apache.tika.parser.mp4.*, + org.apache.tika.parser.pdf.* </Export-Package> <Import-Package> *, com.adobe.xmp;resolution:=optional, com.adobe.xmp.properties;resolution:=optional, - android.util;resolution:=optional + android.util;resolution:=optional, + javax.mail;resolution:=optional, + javax.mail.internet;resolution:=optional, + org.bouncycastle.cert;resolution:=optional, + org.bouncycastle.cert.jcajce;resolution:=optional, + org.bouncycastle.cert.ocsp;resolution:=optional, + org.bouncycastle.cms.bc;resolution:=optional, + org.bouncycastle.operator;resolution:=optional, + org.bouncycastle.operator.bc;resolution:=optional, + org.bouncycastle.tsp;resolution:=optional, 
</Import-Package> </instructions> </configuration> http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-multimedia-bundle/src/test/java/org/apache/tika/module/multimedia/BundleIT.java ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-multimedia-bundle/src/test/java/org/apache/tika/module/multimedia/BundleIT.java b/tika-parser-bundles/tika-parser-multimedia-bundle/src/test/java/org/apache/tika/module/multimedia/BundleIT.java index 9823899..53f3299 100644 --- a/tika-parser-bundles/tika-parser-multimedia-bundle/src/test/java/org/apache/tika/module/multimedia/BundleIT.java +++ b/tika-parser-bundles/tika-parser-multimedia-bundle/src/test/java/org/apache/tika/module/multimedia/BundleIT.java @@ -87,7 +87,7 @@ public class BundleIT { @Test public void testServicesCreated() throws Exception { ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null); - assertEquals("Not all Services have started", 15, services.length); + assertEquals("Not all Services have started", 16, services.length); } @Test http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml deleted file mode 100644 index fe1a269..0000000 --- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml +++ /dev/null @@ -1,109 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor - license agreements. See the NOTICE file distributed with this work for additional - information regarding copyright ownership. The ASF licenses this file to - you under the Apache License, Version 2.0 (the "License"); you may not use - this file except in compliance with the License. 
You may obtain a copy of - the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required - by applicable law or agreed to in writing, software distributed under the - License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS - OF ANY KIND, either express or implied. See the License for the specific - language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - - <parent> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parser-bundles</artifactId> - <version>2.0-SNAPSHOT</version> - </parent> - - <artifactId>tika-parser-pdf-bundle</artifactId> - <packaging>bundle</packaging> - <name>Apache Tika parser pdf bundle</name> - <url>http://tika.apache.org/</url> - - <dependencies> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-parser-pdf-module</artifactId> - <version>${project.version}</version> - </dependency> - </dependencies> - - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-dependency-plugin</artifactId> - </plugin> - <plugin> - <groupId>org.apache.felix</groupId> - <artifactId>maven-bundle-plugin</artifactId> - <extensions>true</extensions> - <configuration> - <instructions> - <Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator> - <Embed-Dependency> - tika-parser-pdf-module;inline=true, - tika-parser-multimedia-module;inline=true, - tika-parser-xmp-commons;inline=true, - commons-io;inline=true, - pdfbox;inline=true, - pdfbox-tools;inline=true, - pdfbox-debugger;inline=true, - bcmail-jdk15on;inline=true, - bcprov-jdk15on;inline=true, - fontbox;inline=true, - jempbox;inline=true, - bcpkix-jdk15on;inline=true - </Embed-Dependency> - <Embed-Transitive>true</Embed-Transitive> - 
<Export-Package> - org.apache.tika.parser.pdf.* - </Export-Package> - <Import-Package> - *, - com.ibm.icu.text;resolution:=optional, - com.coremedia.iso;resolution:=optional, - com.coremedia.iso.boxes;resolution:=optional, - com.coremedia.iso.boxes.apple;resolution:=optional, - com.coremedia.iso.boxes.sampleentry;resolution:=optional, - com.drew.imaging.jpeg;resolution:=optional, - com.drew.imaging.riff;resolution:=optional, - com.drew.imaging.tiff;resolution:=optional, - com.drew.imaging.webp;resolution:=optional, - com.drew.lang;resolution:=optional, - com.drew.metadata;resolution:=optional, - com.drew.metadata.exif;resolution:=optional, - com.drew.metadata.iptc;resolution:=optional, - com.drew.metadata.jpeg;resolution:=optional, - com.googlecode.mp4parser;resolution:=optional, - com.googlecode.mp4parser.boxes.apple;resolution:=optional, - com.googlecode.mp4parser.util;resolution:=optional, - javax.mail;resolution:=optional, - javax.mail.internet;resolution:=optional, - org.bouncycastle.cert;resolution:=optional, - org.bouncycastle.cert.jcajce;resolution:=optional, - org.bouncycastle.cert.ocsp;resolution:=optional, - org.bouncycastle.cms.bc;resolution:=optional, - org.bouncycastle.operator;resolution:=optional, - org.bouncycastle.operator.bc;resolution:=optional, - org.bouncycastle.tsp;resolution:=optional, - org.apache.commons.exec;resolution:=optional, - org.apache.commons.exec.environment;resolution:=optional - </Import-Package> - </instructions> - </configuration> - </plugin> - <plugin> - <artifactId>maven-failsafe-plugin</artifactId> - </plugin> - <plugin> - <artifactId>maven-assembly-plugin</artifactId> - </plugin> - </plugins> - </build> - -</project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java ---------------------------------------------------------------------- diff --git 
a/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java b/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java deleted file mode 100644 index 8e1d010..0000000 --- a/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.module.pdf; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.assertNotNull; -import static org.ops4j.pax.exam.CoreOptions.bundle; -import static org.ops4j.pax.exam.CoreOptions.junitBundles; -import static org.ops4j.pax.exam.CoreOptions.options; -import static org.ops4j.pax.exam.CoreOptions.mavenBundle; - -import javax.inject.Inject; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.StringWriter; -import java.io.Writer; -import java.net.URISyntaxException; -import java.util.Dictionary; - -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.mime.MediaType; -import org.apache.tika.osgi.TikaService; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.sax.BodyContentHandler; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.ops4j.pax.exam.Configuration; -import org.ops4j.pax.exam.Option; -import org.ops4j.pax.exam.junit.PaxExam; -import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy; -import org.ops4j.pax.exam.spi.reactors.PerMethod; -import org.osgi.framework.Bundle; -import org.osgi.framework.BundleContext; -import org.osgi.framework.ServiceReference; -import org.xml.sax.ContentHandler; - -@RunWith(PaxExam.class) -@ExamReactorStrategy(PerMethod.class) -public class BundleIT { - - private static final String BUNDLE_JAR_SYS_PROP = "project.bundle.file"; - - @Inject - private BundleContext bc; - - @Configuration - public Option[] configuration() throws IOException, URISyntaxException { - String bundleFileName = System.getProperty(BUNDLE_JAR_SYS_PROP); - return options(junitBundles(), - bundle(new File("target/test-bundles/tika-core.jar").toURI().toURL().toString()), - bundle(new File(bundleFileName).toURI().toString())); - } - - 
@Test - public void testBundleLoaded() throws Exception { - boolean hasCore = false, hasBundle = false; - for (Bundle b : bc.getBundles()) { - if ("org.apache.tika.core".equals(b.getSymbolicName())) { - hasCore = true; - assertEquals("Core not activated", Bundle.ACTIVE, b.getState()); - } - if ("org.apache.tika.parser-pdf-bundle".equals(b.getSymbolicName())) { - hasBundle = true; - assertEquals("Bundle not activated", Bundle.ACTIVE, b.getState()); - } - } - assertTrue("Core bundle not found", hasCore); - assertTrue("PDF bundle not found", hasBundle); - } - - @Test - public void testServicesCreated() throws Exception { - ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null); - assertEquals("Not all Services have started", 15, services.length); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-pdf-bundle/test-bundles.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/test-bundles.xml b/tika-parser-bundles/tika-parser-pdf-bundle/test-bundles.xml deleted file mode 100644 index 53bb6e4..0000000 --- a/tika-parser-bundles/tika-parser-pdf-bundle/test-bundles.xml +++ /dev/null @@ -1,34 +0,0 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. - --> -<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2 http://maven.apache.org/xsd/assembly-1.1.2.xsd"> - <id>bundles</id> - <formats> - <format>dir</format> - </formats> - <includeBaseDirectory>false</includeBaseDirectory> - <dependencySets> - <dependencySet> - <outputDirectory/> - <outputFileNameMapping>${artifact.artifactId}.jar</outputFileNameMapping> - <includes> - <include>org.apache.tika:tika-core</include> - </includes> - </dependencySet> - </dependencySets> -</assembly> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml index 6912f8b..cd13ba7 100644 --- a/tika-parser-modules/pom.xml +++ b/tika-parser-modules/pom.xml @@ -56,7 +56,6 @@ <module>tika-parser-multimedia-module</module> <module>tika-parser-office-module</module> <module>tika-parser-package-module</module> - <module>tika-parser-pdf-module</module> <module>tika-parser-scientific-module</module> <module>tika-parser-text-module</module> <module>tika-parser-web-module</module> http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-journal-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-journal-module/pom.xml b/tika-parser-modules/tika-parser-journal-module/pom.xml index c45c2a9..8c2f3d7 100644 --- a/tika-parser-modules/tika-parser-journal-module/pom.xml +++ b/tika-parser-modules/tika-parser-journal-module/pom.xml @@ -45,7 +45,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - 
<artifactId>tika-parser-pdf-module</artifactId> + <artifactId>tika-parser-multimedia-module</artifactId> <version>${project.version}</version> </dependency> <dependency> http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml index 74cb504..9bdc5eb 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml +++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml @@ -71,6 +71,34 @@ <version>${pdfbox.version}</version> </dependency> <dependency> + <groupId>org.apache.pdfbox</groupId> + <artifactId>pdfbox</artifactId> + <version>${pdfbox.version}</version> + </dependency> + <dependency> + <groupId>org.apache.pdfbox</groupId> + <artifactId>pdfbox-tools</artifactId> + <version>${pdfbox.version}</version> + </dependency> + <dependency> + <groupId>org.apache.pdfbox</groupId> + <artifactId>jempbox</artifactId> + <version>${jempbox.version}</version> + </dependency> + <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies + as optional, but we prefer to have them always to avoid + problems with encrypted PDFs. 
--> + <dependency> + <groupId>org.bouncycastle</groupId> + <artifactId>bcmail-jdk15on</artifactId> + <version>${bouncycastle.version}</version> + </dependency> + <dependency> + <groupId>org.bouncycastle</groupId> + <artifactId>bcprov-jdk15on</artifactId> + <version>${bouncycastle.version}</version> + </dependency> + <dependency> <groupId>commons-logging</groupId> <artifactId>commons-logging</artifactId> <version>${commons.logging.version}</version> @@ -87,6 +115,37 @@ <version>${project.version}</version> <scope>test</scope> </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-parser-package-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-parser-text-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-parser-office-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <!-- Copied from PDFBox: + For legal reasons (incompatible license), jai-imageio-core is to be used + only in the tests and may not be distributed. 
See also LEGAL-195 --> + <dependency> + <groupId>com.github.jai-imageio</groupId> + <artifactId>jai-imageio-core</artifactId> + <scope>test</scope> + </dependency> </dependencies> <build> http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java new file mode 100644 index 0000000..832b06e --- /dev/null +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -0,0 +1,579 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pdf; + +import java.awt.image.BufferedImage; +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.List; +import java.util.ListIterator; +import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; + +import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR; + +import javax.xml.stream.XMLStreamException; +import org.apache.commons.io.IOExceptionWithCause; +import org.apache.commons.io.IOUtils; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentCatalog; +import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; +import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; +import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; +import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; +import org.apache.pdfbox.pdmodel.interactive.action.PDAction; +import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; +import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; +import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode; +import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; +import org.apache.pdfbox.pdmodel.interactive.form.PDField; +import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField; +import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; +import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource; +import org.apache.pdfbox.rendering.PDFRenderer; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.tools.imageio.ImageIOUtil; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.ocr.TesseractOCRConfig; +import org.apache.tika.parser.ocr.TesseractOCRParser; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; +import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR; + +class AbstractPDF2XHTML extends PDFTextStripper { + + /** + * Maximum recursive depth during AcroForm processing. + * Prevents theoretical AcroForm recursion bomb. 
+ */ + private final static int MAX_ACROFORM_RECURSIONS = 10; + + private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig(); + + /** + * Format used for signature dates + * TODO Make this thread-safe + */ + private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT); + + + final List<IOException> exceptions = new ArrayList<>(); + final PDDocument pdDocument; + final XHTMLContentHandler xhtml; + private final ParseContext context; + private final Metadata metadata; + final PDFParserConfig config; + + private int pageIndex = 0; + + AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata, + PDFParserConfig config) throws IOException { + this.pdDocument = pdDocument; + this.xhtml = new XHTMLContentHandler(handler, metadata); + this.context = context; + this.metadata = metadata; + this.config = config; + } + + @Override + protected void startPage(PDPage page) throws IOException { + try { + xhtml.startElement("div", "class", "page"); + } catch (SAXException e) { + throw new IOExceptionWithCause("Unable to start a page", e); + } + writeParagraphStart(); + } + + EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() { + EmbeddedDocumentExtractor extractor = + context.get(EmbeddedDocumentExtractor.class); + if (extractor == null) { + extractor = new ParsingEmbeddedDocumentExtractor(context); + } + return extractor; + } + + private void extractEmbeddedDocuments(PDDocument document) + throws IOException, SAXException, TikaException { + PDDocumentNameDictionary namesDictionary = + new PDDocumentNameDictionary(document.getDocumentCatalog()); + PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); + if (efTree == null) { + return; + } + + Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames(); + //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. 
+ //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java + //If there is a need we could add a fully recursive search to find a non-null + //Map<String, COSObjectable> that contains the doc info. + if (embeddedFileNames != null) { + processEmbeddedDocNames(embeddedFileNames); + } else { + List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); + if (kids == null) { + return; + } + for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { + embeddedFileNames = node.getNames(); + if (embeddedFileNames != null) { + processEmbeddedDocNames(embeddedFileNames); + } + } + } + } + + private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames) + throws IOException, SAXException, TikaException { + if (embeddedFileNames == null || embeddedFileNames.isEmpty()) { + return; + } + + EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); + for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) { + PDComplexFileSpecification spec = ent.getValue(); + extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor); + } + } + + private void extractMultiOSPDEmbeddedFiles(String displayName, + PDComplexFileSpecification spec, + EmbeddedDocumentExtractor extractor) throws IOException, + SAXException, TikaException { + + if (spec == null) { + return; + } + //current strategy is to pull all, not just first non-null + extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), extractor); + extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor); + extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor); + extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor); + } + + private void extractPDEmbeddedFile(String displayName, String unicodeFileName, + String 
fileName, PDEmbeddedFile file, + EmbeddedDocumentExtractor extractor) + throws SAXException, IOException, TikaException { + + if (file == null) { + //skip silently + return; + } + + fileName = (fileName == null) ? displayName : fileName; + + // TODO: other metadata? + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); + metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); + metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); + metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName); + + if (extractor.shouldParseEmbedded(metadata)) { + TikaInputStream stream = null; + try { + stream = TikaInputStream.get(file.createInputStream()); + extractor.parseEmbedded( + stream, + new EmbeddedContentHandler(xhtml), + metadata, false); + + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", fileName); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } finally { + IOUtils.closeQuietly(stream); + } + } + } + + void handleCatchableIOE(IOException e) throws IOException { + if (config.isCatchIntermediateIOExceptions()) { + String msg = e.getMessage(); + if (msg == null) { + msg = "IOException, no message"; + } + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg); + exceptions.add(e); + } else { + throw e; + } + } + + void doOCROnCurrentPage() throws IOException, TikaException, SAXException { + if (config.getOCRStrategy().equals(NO_OCR)) { + return; + } + TesseractOCRConfig tesseractConfig = + context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG); + + TesseractOCRParser tesseractOCRParser = new TesseractOCRParser(); + if (! tesseractOCRParser.hasTesseract(tesseractConfig)) { + throw new TikaException("Tesseract is not available. 
"+ + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly"); + } + + PDFRenderer renderer = new PDFRenderer(pdDocument); + TemporaryResources tmp = new TemporaryResources(); + try { + BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOCRImageType()); + Path tmpFile = tmp.createTempFile(); + try (OutputStream os = Files.newOutputStream(tmpFile)) { + //TODO: get output format from TesseractConfig + ImageIOUtil.writeImage(image, config.getOCRImageFormatName(), + os, config.getOCRDPI()); + } + try (InputStream is = TikaInputStream.get(tmpFile)) { + tesseractOCRParser.parseInline(is, xhtml, tesseractConfig); + } + } catch (IOException e) { + handleCatchableIOE(e); + } catch (SAXException e) { + throw new IOExceptionWithCause("error writing OCR content from PDF", e); + } finally { + tmp.dispose(); + } + } + + @Override + protected void endPage(PDPage page) throws IOException { + + try { + EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); + for (PDAnnotation annotation : page.getAnnotations()) { + + if (annotation instanceof PDAnnotationFileAttachment) { + PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; + PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); + try { + extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor); + } catch (SAXException e) { + throw new IOExceptionWithCause("file embedded in annotation sax exception", e); + } catch (TikaException e) { + throw new IOExceptionWithCause("file embedded in annotation tika exception", e); + } catch (IOException e) { + handleCatchableIOE(e); + } + } + // TODO: remove once PDFBOX-1143 is fixed: + if (config.getExtractAnnotationText()) { + if (annotation instanceof PDAnnotationLink) { + PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; + if (annotationlink.getAction() != null) { + PDAction action = annotationlink.getAction(); + if (action instanceof PDActionURI) { + 
PDActionURI uri = (PDActionURI) action; + String link = uri.getURI(); + if (link != null) { + xhtml.startElement("div", "class", "annotation"); + xhtml.startElement("a", "href", link); + xhtml.endElement("a"); + xhtml.endElement("div"); + } + } + } + } + + if (annotation instanceof PDAnnotationMarkup) { + PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; + String title = annotationMarkup.getTitlePopup(); + String subject = annotationMarkup.getSubject(); + String contents = annotationMarkup.getContents(); + // TODO: maybe also annotationMarkup.getRichContents()? + if (title != null || subject != null || contents != null) { + xhtml.startElement("div", "class", "annotation"); + + if (title != null) { + xhtml.startElement("div", "class", "annotationTitle"); + xhtml.characters(title); + xhtml.endElement("div"); + } + + if (subject != null) { + xhtml.startElement("div", "class", "annotationSubject"); + xhtml.characters(subject); + xhtml.endElement("div"); + } + + if (contents != null) { + xhtml.startElement("div", "class", "annotationContents"); + xhtml.characters(contents); + xhtml.endElement("div"); + } + + xhtml.endElement("div"); + } + } + } + } + if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { + doOCROnCurrentPage(); + } + xhtml.endElement("div"); + } catch (SAXException|TikaException e) { + throw new IOExceptionWithCause("Unable to end a page", e); + } catch (IOException e) { + // honour config.isCatchIntermediateIOExceptions(): record or rethrow, like every other catch in this class + handleCatchableIOE(e); + } finally { + pageIndex++; + } + } + + @Override + protected void startDocument(PDDocument pdf) throws IOException { + try { + xhtml.startDocument(); + } catch (SAXException e) { + throw new IOExceptionWithCause("Unable to start a document", e); + } + } + + @Override + protected void endDocument(PDDocument pdf) throws IOException { + try { + // Extract text for any bookmarks: + extractBookmarkText(); + try { + extractEmbeddedDocuments(pdf); + } catch (IOException e) { + handleCatchableIOE(e); + } + + 
//extract acroform data at end of doc + if (config.getExtractAcroFormContent() == true) { + try { + extractAcroForm(pdf); + } catch (IOException e) { + handleCatchableIOE(e); + } + } + xhtml.endDocument(); + } catch (TikaException e) { + throw new IOExceptionWithCause("Unable to end a document", e); + } catch (SAXException e) { + throw new IOExceptionWithCause("Unable to end a document", e); + } + } + + void extractBookmarkText() throws SAXException { + PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline(); + if (outline != null) { + extractBookmarkText(outline); + } + } + + void extractBookmarkText(PDOutlineNode bookmark) throws SAXException { + PDOutlineItem current = bookmark.getFirstChild(); + if (current != null) { + xhtml.startElement("ul"); + while (current != null) { + xhtml.startElement("li"); + xhtml.characters(current.getTitle()); + xhtml.endElement("li"); + // Recurse: + extractBookmarkText(current); + current = current.getNextSibling(); + } + xhtml.endElement("ul"); + } + } + + void extractAcroForm(PDDocument pdf) throws IOException, + SAXException { + //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields + //this code derives from Ben's code + PDDocumentCatalog catalog = pdf.getDocumentCatalog(); + + if (catalog == null) + return; + + PDAcroForm form = catalog.getAcroForm(); + if (form == null) + return; + + //if it has xfa, try that. 
+ //if it doesn't exist or there's an exception, + //go with traditional AcroForm + PDXFAResource pdxfa = form.getXFA(); + + if (pdxfa != null) { + //if successful, return + XFAExtractor xfaExtractor = new XFAExtractor(); + try (InputStream is = new BufferedInputStream( + new ByteArrayInputStream(pdxfa.getBytes()))) { + xfaExtractor.extract(is, xhtml, metadata, context); + return; + } catch (XMLStreamException |IOException e) { + //if there was an xml parse exception in xfa, try the AcroForm + } + } + + @SuppressWarnings("rawtypes") + List fields = form.getFields(); + + if (fields == null) + return; + + @SuppressWarnings("rawtypes") + ListIterator itr = fields.listIterator(); + + if (itr == null) + return; + + xhtml.startElement("div", "class", "acroform"); + xhtml.startElement("ol"); + + while (itr.hasNext()) { + Object obj = itr.next(); + if (obj != null && obj instanceof PDField) { + processAcroField((PDField) obj, 0); + } + } + xhtml.endElement("ol"); + xhtml.endElement("div"); + } + + private void processAcroField(PDField field, final int currentRecursiveDepth) + throws SAXException, IOException { + + if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) { + return; + } + addFieldString(field); + if (field instanceof PDNonTerminalField) { + int r = currentRecursiveDepth + 1; + xhtml.startElement("ol"); + for (PDField child : ((PDNonTerminalField)field).getChildren()) { + processAcroField(child, r); + } + xhtml.endElement("ol"); + } + } + + private void addFieldString(PDField field) throws SAXException { + //Pick partial name to present in content and altName for attribute + //Ignoring FullyQualifiedName for now + String partName = field.getPartialName(); + String altName = field.getAlternateFieldName(); + + StringBuilder sb = new StringBuilder(); + AttributesImpl attrs = new AttributesImpl(); + + if (partName != null) { + sb.append(partName).append(": "); + } + if (altName != null) { + attrs.addAttribute("", "altName", "altName", "CDATA", altName); + } + 
//return early if PDSignature field + if (field instanceof PDSignatureField) { + handleSignature(attrs, (PDSignatureField) field); + return; + } + String value = field.getValueAsString(); + if (value != null && !value.equals("null")) { + sb.append(value); + } + + if (attrs.getLength() > 0 || sb.length() > 0) { + xhtml.startElement("li", attrs); + xhtml.characters(sb.toString()); + xhtml.endElement("li"); + } + } + + /** + * Writes the signature's name, contact info, location, reason and sign date + * as a nested list inside an li element carrying the parent attributes. + * Nothing is written when the field has no signature or when every one of + * those values is null or empty. + * + * @param parentAttributes attributes for the enclosing li element + * @param sigField signature field to report + * @throws SAXException if the content handler fails to process SAX events + */ + private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField) + throws SAXException { + + PDSignature sig = sigField.getSignature(); + if (sig == null) { + return; + } + Map<String, String> vals = new TreeMap<>(); + vals.put("name", sig.getName()); + vals.put("contactInfo", sig.getContactInfo()); + vals.put("location", sig.getLocation()); + vals.put("reason", sig.getReason()); + + Calendar cal = sig.getSignDate(); + if (cal != null) { + dateFormat.setTimeZone(cal.getTimeZone()); + vals.put("date", dateFormat.format(cal.getTime())); + } + //see if there is any data: inspect the values, the keys are never null or empty + int nonNull = 0; + for (String val : vals.values()) { + if (val != null && !val.equals("")) { + nonNull++; + } + } + //if there is, process it + if (nonNull > 0) { + xhtml.startElement("li", parentAttributes); + + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); + + xhtml.startElement("ol", attrs); + for (Map.Entry<String, String> e : vals.entrySet()) { + if (e.getValue() == null || e.getValue().equals("")) { + continue; + } + attrs = new AttributesImpl(); + attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); + xhtml.startElement("li", attrs); + xhtml.characters(e.getValue()); + xhtml.endElement("li"); + } + xhtml.endElement("ol"); + xhtml.endElement("li"); + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java 
---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java new file mode 100644 index 0000000..775e590 --- /dev/null +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.pdf; + +import java.io.Serializable; + +import org.apache.tika.exception.AccessPermissionException; +import org.apache.tika.metadata.AccessPermissions; +import org.apache.tika.metadata.Metadata; + +/** + * Checks whether or not a document allows extraction generally + * or extraction for accessibility only. 
+ */ +public class AccessChecker implements Serializable { + + private static final long serialVersionUID = 6492570218190936986L; + + private final boolean needToCheck; + private final boolean allowAccessibility; + + /** + * This constructs an {@link AccessChecker} that + * will not perform any checking and will always return without + * throwing an exception. + * <p/> + * This constructor is available to allow for Tika's legacy ( <= v1.7) behavior. + */ + public AccessChecker() { + needToCheck = false; + allowAccessibility = true; + } + + /** + * This constructs an {@link AccessChecker} that will check + * for whether or not content should be extracted from a document. + * + * @param allowExtractionForAccessibility if general extraction is not allowed, is extraction for accessibility allowed + */ + public AccessChecker(boolean allowExtractionForAccessibility) { + needToCheck = true; + this.allowAccessibility = allowExtractionForAccessibility; + } + + /** + * Checks to see if a document's content should be extracted based + * on metadata values and the value of {@link #allowAccessibility} in the constructor. 
+ * <p> + * Returns normally when this checker was built with the no-arg constructor, + * or when {@link AccessPermissions#EXTRACT_CONTENT} is anything other than "false". + * + * @param metadata metadata whose {@link AccessPermissions#EXTRACT_CONTENT} and + * {@link AccessPermissions#EXTRACT_FOR_ACCESSIBILITY} values are read + * @throws AccessPermissionException if access is not permitted + */ + public void check(Metadata metadata) throws AccessPermissionException { + if (!needToCheck) { + return; + } + if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { + if (allowAccessibility) { + if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { + return; + } + throw new AccessPermissionException("Content extraction for accessibility is not allowed."); + } + throw new AccessPermissionException("Content extraction is not allowed."); + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java new file mode 100644 index 0000000..3ad551d --- /dev/null +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + +import java.io.IOException; +import java.io.Writer; + +import org.apache.commons.io.IOExceptionWithCause; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + + +/** + * Utility class that overrides the {@link PDFTextStripper} functionality + * to integrate text extraction via OCR only. + * + */ +class OCR2XHTML extends AbstractPDF2XHTML { + + private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, + PDFParserConfig config) + throws IOException { + super(document, handler, context, metadata, config); + } + + /** + * Converts the given PDF document (and related metadata) to a stream + * of XHTML SAX events sent to the given content handler. 
+ * <p> + * The text written by the stripper is discarded by a no-op Writer. + * IOExceptions collected in the exceptions list during processing are + * reported afterwards by throwing a TikaException that wraps the first one. + * + * @param document PDF document + * @param handler SAX content handler + * @param context parse context + * @param metadata PDF metadata + * @param config PDF parser configuration + * @throws SAXException if the content handler fails to process SAX events + * @throws TikaException if there was an exception outside of per page processing + */ + public static void process( + PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, + PDFParserConfig config) + throws SAXException, TikaException { + OCR2XHTML ocr2XHTML = null; + try { + ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config); + ocr2XHTML.writeText(document, new Writer() { + @Override + public void write(char[] cbuf, int off, int len) { + } + + @Override + public void flush() { + } + + @Override + public void close() { + } + }); + } catch (IOException e) { + if (e.getCause() instanceof SAXException) { + throw (SAXException) e.getCause(); + } else { + throw new TikaException("Unable to extract PDF content", e); + } + } + if (ocr2XHTML.exceptions.size() > 0) { + //throw the first + throw new TikaException("Unable to extract all PDF content", + ocr2XHTML.exceptions.get(0)); + } + } + + @Override + public void processPage(PDPage pdPage) throws IOException { + try { + startPage(pdPage); + doOCROnCurrentPage(); + endPage(pdPage); + } catch (TikaException |SAXException e) { + throw new IOExceptionWithCause(e); + } catch (IOException e) { + handleCatchableIOE(e); + } + } + + @Override + protected void writeString(String text) throws IOException { + //no-op + } + + @Override + protected void writeCharacters(TextPosition text) throws IOException { + //no-op + } + + @Override + protected void writeWordSeparator() throws IOException { + //no-op + } + + @Override + protected void writeLineSeparator() throws IOException { + //no-op + } + +} + http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java 
---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java new file mode 100644 index 0000000..ac9823e --- /dev/null +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -0,0 +1,339 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pdf; + +import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.Writer; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.io.IOExceptionWithCause; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.tools.imageio.ImageIOUtil; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Utility class that overrides the {@link PDFTextStripper} functionality + * to produce a semi-structured XHTML SAX events instead of a plain text + * stream. 
+ */ +class PDF2XHTML extends AbstractPDF2XHTML { + + + private static final List<String> JPEG = Arrays.asList( + COSName.DCT_DECODE.getName(), + COSName.DCT_DECODE_ABBREVIATION.getName()); + + /** + * This keeps track of the pdf object ids for inline + * images that have been processed. + * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()} + * is true, this will be checked before extracting an embedded image. + * The integer keeps track of the inlineImageCounter for that image. + * This integer is used to identify images in the markup. + * + * This is used across the document. To avoid infinite recursion + * TIKA-1742, we're limiting the export to one image per page. + */ + private Map<COSStream, Integer> processedInlineImages = new HashMap<>(); + private int inlineImageCounter = 0; + private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, + PDFParserConfig config) + throws IOException { + super(document, handler, context, metadata, config); + } + + /** + * Converts the given PDF document (and related metadata) to a stream + * of XHTML SAX events sent to the given content handler. + * + * @param document PDF document + * @param handler SAX content handler + * @param context parse context + * @param metadata PDF metadata + * @param config PDF parser configuration + * @throws SAXException if the content handler fails to process SAX events + * @throws TikaException if there was an exception outside of per page processing + */ + public static void process( + PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, + PDFParserConfig config) + throws SAXException, TikaException { + PDF2XHTML pdf2XHTML = null; + try { + // Extract text using a dummy Writer as we override the + // key methods to output to the given content + // handler. 
+ pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config); + + config.configure(pdf2XHTML); + + pdf2XHTML.writeText(document, new Writer() { + @Override + public void write(char[] cbuf, int off, int len) { + } + + @Override + public void flush() { + } + + @Override + public void close() { + } + }); + + } catch (IOException e) { + if (e.getCause() instanceof SAXException) { + throw (SAXException) e.getCause(); + } else { + throw new TikaException("Unable to extract PDF content", e); + } + } + if (pdf2XHTML.exceptions.size() > 0) { + //throw the first + throw new TikaException("Unable to extract all PDF content", + pdf2XHTML.exceptions.get(0)); + } + } + + + @Override + public void processPage(PDPage page) throws IOException { + try { + super.processPage(page); + } catch (IOException e) { + handleCatchableIOE(e); + } + } + + @Override + protected void endPage(PDPage page) throws IOException { + try { + writeParagraphEnd(); + try { + extractImages(page.getResources(), new HashSet<COSBase>()); + } catch (IOException e) { + handleCatchableIOE(e); + } + super.endPage(page); + } catch (SAXException e) { + throw new IOExceptionWithCause("Unable to end a page", e); + } catch (IOException e) { + exceptions.add(e); + } + } + + private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException { + if (resources == null || config.getExtractInlineImages() == false) { + return; + } + + for (COSName name : resources.getXObjectNames()) { + + PDXObject object = resources.getXObject(name); + if (object == null) { + continue; + } + COSStream cosStream = object.getCOSObject(); + if (seenThisPage.contains(cosStream)) { + //avoid infinite recursion TIKA-1742 + continue; + } + seenThisPage.add(cosStream); + + if (object instanceof PDFormXObject) { + extractImages(((PDFormXObject) object).getResources(), seenThisPage); + } else if (object instanceof PDImageXObject) { + + PDImageXObject image = (PDImageXObject) object; + + Metadata 
metadata = new Metadata(); + String extension = image.getSuffix(); + if (extension == null) { + metadata.set(Metadata.CONTENT_TYPE, "image/png"); + extension = "png"; + } else if (extension.equals("jpg")) { + metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); + } else if (extension.equals("tiff")) { + metadata.set(Metadata.CONTENT_TYPE, "image/tiff"); + extension = "tif"; + } else { + //TODO: determine if we need to add more image types + //throw new RuntimeException("EXTEN:" + extension); + } + + Integer imageNumber = processedInlineImages.get(cosStream); + if (imageNumber == null) { + imageNumber = inlineImageCounter++; + } + String fileName = "image" + imageNumber + "."+extension; + metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); + + // Output the img tag + AttributesImpl attr = new AttributesImpl(); + attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); + attr.addAttribute("", "alt", "alt", "CDATA", fileName); + xhtml.startElement("img", attr); + xhtml.endElement("img"); + + //Do we only want to process unique COSObject ids? + //If so, have we already processed this one? + if (config.getExtractUniqueInlineImagesOnly() == true) { + if (processedInlineImages.containsKey(cosStream)) { + continue; + } + processedInlineImages.put(cosStream, imageNumber); + } + + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); + + EmbeddedDocumentExtractor extractor = + getEmbeddedDocumentExtractor(); + if (extractor.shouldParseEmbedded(metadata)) { + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + try { + //TODO: handle image.getMetadata()? 
+ writeToBuffer(image, extension, buffer); + extractor.parseEmbedded( + new ByteArrayInputStream(buffer.toByteArray()), + new EmbeddedContentHandler(xhtml), + metadata, false); + } catch (IOException e) { + handleCatchableIOE(e); + } + } + } + } + } + + //nearly directly copied from PDFBox ExtractImages + private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream out) + throws IOException { + + BufferedImage image = pdImage.getImage(); + if (image != null) { + if ("jpg".equals(suffix)) { + String colorSpaceName = pdImage.getColorSpace().getName(); + //TODO: figure out if we want directJPEG as a configuration + //previously: if (directJPeg || PDDeviceGray.... + if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) || + PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) { + // RGB or Gray colorspace: get and write the unmodifiedJPEG stream + InputStream data = pdImage.getStream().createInputStream(JPEG); + org.apache.pdfbox.io.IOUtils.copy(data, out); + org.apache.pdfbox.io.IOUtils.closeQuietly(data); + } else { + // for CMYK and other "unusual" colorspaces, the JPEG will be converted + ImageIOUtil.writeImage(image, suffix, out); + } + } else { + ImageIOUtil.writeImage(image, suffix, out); + } + } + out.flush(); + } + + @Override + protected void writeParagraphStart() throws IOException { + super.writeParagraphStart(); + try { + xhtml.startElement("p"); + } catch (SAXException e) { + throw new IOExceptionWithCause("Unable to start a paragraph", e); + } + } + + @Override + protected void writeParagraphEnd() throws IOException { + super.writeParagraphEnd(); + try { + xhtml.endElement("p"); + } catch (SAXException e) { + throw new IOExceptionWithCause("Unable to end a paragraph", e); + } + } + + @Override + protected void writeString(String text) throws IOException { + try { + xhtml.characters(text); + } catch (SAXException e) { + throw new IOExceptionWithCause( + "Unable to write a string: " + text, e); + } + } + + @Override + protected 
void writeCharacters(TextPosition text) throws IOException { + try { + xhtml.characters(text.getUnicode()); + } catch (SAXException e) { + throw new IOExceptionWithCause( + "Unable to write a character: " + text.getUnicode(), e); + } + } + + @Override + protected void writeWordSeparator() throws IOException { + try { + xhtml.characters(getWordSeparator()); + } catch (SAXException e) { + throw new IOExceptionWithCause( + "Unable to write a space character", e); + } + } + + @Override + protected void writeLineSeparator() throws IOException { + try { + xhtml.newline(); + } catch (SAXException e) { + throw new IOExceptionWithCause( + "Unable to write a newline character", e); + } + } + +} + http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java new file mode 100644 index 0000000..057f833 --- /dev/null +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.pdf;

import static java.nio.charset.StandardCharsets.ISO_8859_1;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.COSParser;

/**
 * In fairly rare cases, a PDF's XMP will contain a string that
 * has incorrectly been encoded with PDFEncoding: an octal for non-ascii and
 * ascii for ascii, e.g. "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000"
 * <p>
 * This class can be used to decode those strings.
 * <p>
 * See TIKA-1678.  Many thanks to Andrew Jackson for raising this issue
 * and Tilman Hausherr for the solution.
 * <p>
 * As of this writing, we are only handling strings that start with
 * an encoded BOM.  Andrew Jackson found a handful of other examples (e.g.
 * this ISO-8859-7 string:
 * "Microsoft Word - \\323\\365\\354\\354\\345\\364\\357\\367\\336
 * \\364\\347\\362 PRAKSIS \\363\\364\\357")
 * that we aren't currently handling.
 */
class PDFEncodedStringDecoder {

    /**
     * Byte order marks written as literal backslash-octal escape text,
     * i.e. the characters a mis-encoded string starts with.
     */
    private static final String[] PDF_ENCODING_BOMS = {
            "\\376\\377",      //UTF-16BE
            "\\377\\376",      //UTF-16LE
            "\\357\\273\\277"  //UTF-8
    };

    /**
     * Does this string start with an octal-encoded UTF BOM?
     * Call this statically to determine if you should bother creating a
     * decoder to parse the string.
     *
     * @param s candidate string; may be {@code null}
     * @return {@code true} if {@code s} is non-null and begins with one of
     *         the octal-escaped BOMs, {@code false} otherwise
     */
    static boolean shouldDecode(String s) {
        if (s == null) {
            return false;
        }
        // startsWith() is already false for strings shorter than the prefix,
        // so no separate minimum-length check is needed.
        for (String bom : PDF_ENCODING_BOMS) {
            if (s.startsWith(bom)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decodes an octal-escaped string by wrapping it in parentheses and
     * handing it to a {@link COSParser} as a COS literal string.
     * <p>
     * This assumes that {@link #shouldDecode(String)} has been called
     * and has returned true.  If you run this on a non-octal encoded string,
     * disaster will happen!
     *
     * @param value the octal-escaped string to decode; may be {@code null}
     * @return the decoded string; {@code value} unchanged if it could not be
     *         parsed; {@code null} if {@code value} is {@code null}
     */
    String decode(String value) {
        if (value == null) {
            // Without this guard the concatenation below would yield "(null)"
            // and the literal text "null" would come back as the result.
            return null;
        }
        byte[] bytes = ("(" + value + ")").getBytes(ISO_8859_1);
        try (InputStream is = new ByteArrayInputStream(bytes);
             RandomAccessRead buffer = new RandomAccessBuffer(is)) {
            String parsed = new COSStringParser(buffer).myParseCOSString();
            if (parsed != null) {
                return parsed;
            }
        } catch (IOException ignored) {
            // Deliberate best effort: fall through and return the input as-is.
        }
        //just return value if something went wrong
        return value;
    }

    /**
     * Minimal {@link COSParser} subclass whose only job is to expose
     * {@code parseCOSString()} to the enclosing decoder.  Declared static
     * because it uses no state of the enclosing instance.
     */
    static class COSStringParser extends COSParser {

        /**
         * @param buffer source positioned at the opening parenthesis of a
         *               COS literal string
         * @throws IOException if the parent parser cannot be initialized
         */
        COSStringParser(RandomAccessRead buffer) throws IOException {
            super(buffer);
        }

        /**
         * Parses one COS string from the underlying source.
         *
         * @return parsed string or null if something went wrong.
         */
        String myParseCOSString() {
            try {
                COSString cosString = parseCOSString();
                if (cosString != null) {
                    return cosString.getString();
                }
            } catch (IOException ignored) {
                // Deliberate best effort: a parse failure is reported as null
                // so the caller can fall back to the undecoded value.
            }
            return null;
        }
    }
}
