TIKA-2059 - Merge multimedia and pdf parser modules and bundles

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/59e0ca0f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/59e0ca0f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/59e0ca0f

Branch: refs/heads/2.x
Commit: 59e0ca0fcb311b2207295f17dcf37ac293d28583
Parents: 87b6d5d
Author: Bob Paulin <[email protected]>
Authored: Sun Aug 28 11:29:22 2016 -0500
Committer: Bob Paulin <[email protected]>
Committed: Sun Aug 28 11:29:22 2016 -0500

----------------------------------------------------------------------
 tika-parser-bundles/pom.xml                     |    1 -
 .../tika-parser-journal-bundle/pom.xml          |    2 +-
 .../apache/tika/module/journal/BundleIT.java    |    4 +-
 .../tika-parser-journal-bundle/test-bundles.xml |    2 +-
 .../tika-parser-multimedia-bundle/pom.xml       |   20 +-
 .../apache/tika/module/multimedia/BundleIT.java |    2 +-
 .../tika-parser-pdf-bundle/pom.xml              |  109 --
 .../org/apache/tika/module/pdf/BundleIT.java    |   96 --
 .../tika-parser-pdf-bundle/test-bundles.xml     |   34 -
 tika-parser-modules/pom.xml                     |    1 -
 .../tika-parser-journal-module/pom.xml          |    2 +-
 .../tika-parser-multimedia-module/pom.xml       |   59 +
 .../tika/parser/pdf/AbstractPDF2XHTML.java      |  579 ++++++++
 .../apache/tika/parser/pdf/AccessChecker.java   |   81 ++
 .../org/apache/tika/parser/pdf/OCR2XHTML.java   |  125 ++
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   |  339 +++++
 .../parser/pdf/PDFEncodedStringDecoder.java     |  119 ++
 .../org/apache/tika/parser/pdf/PDFParser.java   |  626 +++++++++
 .../apache/tika/parser/pdf/PDFParserConfig.java |  614 +++++++++
 .../apache/tika/parser/pdf/XFAExtractor.java    |  304 +++++
 .../services/org.apache.tika.parser.Parser      |    3 +-
 .../apache/tika/parser/pdf/PDFParser.properties |   34 +
 .../tika/parser/pdf/AccessCheckerTest.java      |  137 ++
 .../apache/tika/parser/pdf/PDFParserTest.java   | 1240 ++++++++++++++++++
 .../tika-parser-pdf-module/pom.xml              |  126 --
 .../tika/module/pdf/internal/Activator.java     |   36 -
 .../tika/parser/pdf/AbstractPDF2XHTML.java      |  579 --------
 .../apache/tika/parser/pdf/AccessChecker.java   |   81 --
 .../org/apache/tika/parser/pdf/OCR2XHTML.java   |  125 --
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   |  339 -----
 .../parser/pdf/PDFEncodedStringDecoder.java     |  119 --
 .../org/apache/tika/parser/pdf/PDFParser.java   |  626 ---------
 .../apache/tika/parser/pdf/PDFParserConfig.java |  614 ---------
 .../apache/tika/parser/pdf/XFAExtractor.java    |  304 -----
 .../services/org.apache.tika.parser.Parser      |   17 -
 .../apache/tika/parser/pdf/PDFParser.properties |   34 -
 .../tika/parser/pdf/AccessCheckerTest.java      |  137 --
 .../apache/tika/parser/pdf/PDFParserTest.java   | 1240 ------------------
 tika-parsers/pom.xml                            |    6 -
 39 files changed, 4283 insertions(+), 4633 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/pom.xml b/tika-parser-bundles/pom.xml
index bcaf4d1..37ab26d 100644
--- a/tika-parser-bundles/pom.xml
+++ b/tika-parser-bundles/pom.xml
@@ -52,7 +52,6 @@
     <module>tika-parser-multimedia-bundle</module>
     <module>tika-parser-office-bundle</module>
     <module>tika-parser-package-bundle</module>
-    <module>tika-parser-pdf-bundle</module>
     <module>tika-parser-scientific-bundle</module>
     <module>tika-parser-text-bundle</module>
     <module>tika-parser-web-bundle</module>

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-journal-bundle/pom.xml b/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
index b918a7a..02113be 100644
--- a/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-journal-bundle/pom.xml
@@ -32,7 +32,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-pdf-bundle</artifactId>
+      <artifactId>tika-parser-multimedia-bundle</artifactId>
       <version>${project.version}</version>
     </dependency>
   </dependencies>

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java b/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
index c8e8448..2d72e17 100644
--- a/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
+++ b/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
@@ -68,7 +68,7 @@ public class BundleIT {
         String bundleFileName = System.getProperty(BUNDLE_JAR_SYS_PROP);
         return options(junitBundles(), 
                 bundle(new File("target/test-bundles/tika-core.jar").toURI().toURL().toString()),
-                bundle(new File("target/test-bundles/tika-parser-pdf-bundle.jar").toURI().toURL().toString()),
+                bundle(new File("target/test-bundles/tika-parser-multimedia-bundle.jar").toURI().toURL().toString()),
                 bundle(new File(bundleFileName).toURI().toString()));
     }
 
@@ -92,6 +92,6 @@ public class BundleIT {
     @Test
     public void testServicesCreated() throws Exception {
         ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null);
-        assertEquals("Not all Services have started", 16, services.length);
+        assertEquals("Not all Services have started", 17, services.length);
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-journal-bundle/test-bundles.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-journal-bundle/test-bundles.xml b/tika-parser-bundles/tika-parser-journal-bundle/test-bundles.xml
index 8cd36d6..faea0bf 100644
--- a/tika-parser-bundles/tika-parser-journal-bundle/test-bundles.xml
+++ b/tika-parser-bundles/tika-parser-journal-bundle/test-bundles.xml
@@ -28,7 +28,7 @@
       <outputFileNameMapping>${artifact.artifactId}.jar</outputFileNameMapping>
       <includes>
         <include>org.apache.tika:tika-core</include>
-        <include>org.apache.tika:tika-parser-pdf-bundle</include>
+        <include>org.apache.tika:tika-parser-multimedia-bundle</include>
       </includes>
     </dependencySet>
   </dependencySets>

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
index ab1d1b4..8a45cf7 100644
--- a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
@@ -54,6 +54,12 @@
               jempbox;inline=true,
               fontbox;inline=true,
               isoparser;inline=true,
+              pdfbox;inline=true,
+              pdfbox-tools;inline=true,
+              pdfbox-debugger;inline=true,
+              bcmail-jdk15on,
+              bcprov-jdk15on,
+              bcpkix-jdk15on
             </Embed-Dependency>
             <Embed-Transitive>true</Embed-Transitive>
             <Export-Package>
@@ -62,13 +68,23 @@
               org.apache.tika.parser.audio.*,
               org.apache.tika.parser.video.*,
               org.apache.tika.parser.mp3.*,
-              org.apache.tika.parser.mp4.*
+              org.apache.tika.parser.mp4.*,
+              org.apache.tika.parser.pdf.*
             </Export-Package>
             <Import-Package>
               *,
               com.adobe.xmp;resolution:=optional,
               com.adobe.xmp.properties;resolution:=optional,
-              android.util;resolution:=optional
+              android.util;resolution:=optional,
+              javax.mail;resolution:=optional,
+              javax.mail.internet;resolution:=optional,
+              org.bouncycastle.cert;resolution:=optional,
+              org.bouncycastle.cert.jcajce;resolution:=optional,
+              org.bouncycastle.cert.ocsp;resolution:=optional,
+              org.bouncycastle.cms.bc;resolution:=optional,
+              org.bouncycastle.operator;resolution:=optional,
+              org.bouncycastle.operator.bc;resolution:=optional,
+              org.bouncycastle.tsp;resolution:=optional,
             </Import-Package>
           </instructions>
         </configuration>

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-multimedia-bundle/src/test/java/org/apache/tika/module/multimedia/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-multimedia-bundle/src/test/java/org/apache/tika/module/multimedia/BundleIT.java b/tika-parser-bundles/tika-parser-multimedia-bundle/src/test/java/org/apache/tika/module/multimedia/BundleIT.java
index 9823899..53f3299 100644
--- a/tika-parser-bundles/tika-parser-multimedia-bundle/src/test/java/org/apache/tika/module/multimedia/BundleIT.java
+++ b/tika-parser-bundles/tika-parser-multimedia-bundle/src/test/java/org/apache/tika/module/multimedia/BundleIT.java
@@ -87,7 +87,7 @@ public class BundleIT {
     @Test
     public void testServicesCreated() throws Exception {
         ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null);
-        assertEquals("Not all Services have started", 15, services.length);
+        assertEquals("Not all Services have started", 16, services.length);
     }
 
     @Test

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
deleted file mode 100644
index fe1a269..0000000
--- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
+++ /dev/null
@@ -1,109 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
-  license agreements. See the NOTICE file distributed with this work for additional
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-bundles</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-pdf-bundle</artifactId>
-  <packaging>bundle</packaging>
-  <name>Apache Tika parser pdf bundle</name>
-  <url>http://tika.apache.org/</url>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-pdf-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.felix</groupId>
-        <artifactId>maven-bundle-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <instructions>
-            <Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator>
-            <Embed-Dependency>
-              tika-parser-pdf-module;inline=true,
-              tika-parser-multimedia-module;inline=true,
-              tika-parser-xmp-commons;inline=true,
-              commons-io;inline=true,
-              pdfbox;inline=true,
-              pdfbox-tools;inline=true,
-              pdfbox-debugger;inline=true,
-              bcmail-jdk15on;inline=true,
-              bcprov-jdk15on;inline=true,
-              fontbox;inline=true,
-              jempbox;inline=true,
-              bcpkix-jdk15on;inline=true
-            </Embed-Dependency> 
-            <Embed-Transitive>true</Embed-Transitive>
-            <Export-Package>
-              org.apache.tika.parser.pdf.*
-            </Export-Package>
-            <Import-Package>
-              *,
-              com.ibm.icu.text;resolution:=optional,
-              com.coremedia.iso;resolution:=optional,
-              com.coremedia.iso.boxes;resolution:=optional,
-              com.coremedia.iso.boxes.apple;resolution:=optional,
-              com.coremedia.iso.boxes.sampleentry;resolution:=optional,
-              com.drew.imaging.jpeg;resolution:=optional,
-              com.drew.imaging.riff;resolution:=optional,
-              com.drew.imaging.tiff;resolution:=optional,
-              com.drew.imaging.webp;resolution:=optional,
-              com.drew.lang;resolution:=optional,
-              com.drew.metadata;resolution:=optional,
-              com.drew.metadata.exif;resolution:=optional,
-              com.drew.metadata.iptc;resolution:=optional,
-              com.drew.metadata.jpeg;resolution:=optional,
-              com.googlecode.mp4parser;resolution:=optional,
-              com.googlecode.mp4parser.boxes.apple;resolution:=optional,
-              com.googlecode.mp4parser.util;resolution:=optional,
-              javax.mail;resolution:=optional,
-              javax.mail.internet;resolution:=optional,
-              org.bouncycastle.cert;resolution:=optional,
-              org.bouncycastle.cert.jcajce;resolution:=optional,
-              org.bouncycastle.cert.ocsp;resolution:=optional,
-              org.bouncycastle.cms.bc;resolution:=optional,
-              org.bouncycastle.operator;resolution:=optional,
-              org.bouncycastle.operator.bc;resolution:=optional,
-              org.bouncycastle.tsp;resolution:=optional,
-              org.apache.commons.exec;resolution:=optional,
-              org.apache.commons.exec.environment;resolution:=optional
-            </Import-Package>
-          </instructions>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-failsafe-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
-</project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java b/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
deleted file mode 100644
index 8e1d010..0000000
--- a/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.pdf;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertNotNull;
-import static org.ops4j.pax.exam.CoreOptions.bundle;
-import static org.ops4j.pax.exam.CoreOptions.junitBundles;
-import static org.ops4j.pax.exam.CoreOptions.options;
-import static org.ops4j.pax.exam.CoreOptions.mavenBundle;
-
-import javax.inject.Inject;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.io.Writer;
-import java.net.URISyntaxException;
-import java.util.Dictionary;
-
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.osgi.TikaService;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.ops4j.pax.exam.Configuration;
-import org.ops4j.pax.exam.Option;
-import org.ops4j.pax.exam.junit.PaxExam;
-import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy;
-import org.ops4j.pax.exam.spi.reactors.PerMethod;
-import org.osgi.framework.Bundle;
-import org.osgi.framework.BundleContext;
-import org.osgi.framework.ServiceReference;
-import org.xml.sax.ContentHandler;
-
-@RunWith(PaxExam.class)
-@ExamReactorStrategy(PerMethod.class)
-public class BundleIT {
-
-    private static final String BUNDLE_JAR_SYS_PROP = "project.bundle.file";
-
-    @Inject
-    private BundleContext bc;
-
-    @Configuration
-    public Option[] configuration() throws IOException, URISyntaxException {
-        String bundleFileName = System.getProperty(BUNDLE_JAR_SYS_PROP);
-        return options(junitBundles(), 
-                bundle(new File("target/test-bundles/tika-core.jar").toURI().toURL().toString()),
-                bundle(new File(bundleFileName).toURI().toString()));
-    }
-
-    @Test
-    public void testBundleLoaded() throws Exception {
-        boolean hasCore = false, hasBundle = false;
-        for (Bundle b : bc.getBundles()) {
-            if ("org.apache.tika.core".equals(b.getSymbolicName())) {
-                hasCore = true;
-                assertEquals("Core not activated", Bundle.ACTIVE, b.getState());
-            }
-            if ("org.apache.tika.parser-pdf-bundle".equals(b.getSymbolicName())) {
-                hasBundle = true;
-                assertEquals("Bundle not activated", Bundle.ACTIVE, b.getState());
-            }
-        }
-        assertTrue("Core bundle not found", hasCore);
-        assertTrue("PDF bundle not found", hasBundle);
-    }
-    
-    @Test
-    public void testServicesCreated() throws Exception {
-        ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null);
-        assertEquals("Not all Services have started", 15, services.length);
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-bundles/tika-parser-pdf-bundle/test-bundles.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/test-bundles.xml b/tika-parser-bundles/tika-parser-pdf-bundle/test-bundles.xml
deleted file mode 100644
index 53bb6e4..0000000
--- a/tika-parser-bundles/tika-parser-pdf-bundle/test-bundles.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-  -->
-<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2"
-          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2 http://maven.apache.org/xsd/assembly-1.1.2.xsd">
-  <id>bundles</id>
-  <formats>
-    <format>dir</format>
-  </formats>
-  <includeBaseDirectory>false</includeBaseDirectory>
-  <dependencySets>
-    <dependencySet>
-      <outputDirectory/>
-      <outputFileNameMapping>${artifact.artifactId}.jar</outputFileNameMapping>
-      <includes>
-        <include>org.apache.tika:tika-core</include>
-      </includes>
-    </dependencySet>
-  </dependencySets>
-</assembly>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index 6912f8b..cd13ba7 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -56,7 +56,6 @@
     <module>tika-parser-multimedia-module</module>
     <module>tika-parser-office-module</module>
     <module>tika-parser-package-module</module>
-    <module>tika-parser-pdf-module</module>
     <module>tika-parser-scientific-module</module>
     <module>tika-parser-text-module</module>
     <module>tika-parser-web-module</module>

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-journal-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-journal-module/pom.xml b/tika-parser-modules/tika-parser-journal-module/pom.xml
index c45c2a9..8c2f3d7 100644
--- a/tika-parser-modules/tika-parser-journal-module/pom.xml
+++ b/tika-parser-modules/tika-parser-journal-module/pom.xml
@@ -45,7 +45,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-pdf-module</artifactId>
+      <artifactId>tika-parser-multimedia-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
index 74cb504..9bdc5eb 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml
+++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
@@ -71,6 +71,34 @@
       <version>${pdfbox.version}</version>
     </dependency>
     <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>pdfbox</artifactId>
+      <version>${pdfbox.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>pdfbox-tools</artifactId>
+      <version>${pdfbox.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>jempbox</artifactId>
+      <version>${jempbox.version}</version>
+    </dependency>
+    <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
+         as optional, but we prefer to have them always to avoid
+         problems with encrypted PDFs. -->
+    <dependency>
+      <groupId>org.bouncycastle</groupId>
+      <artifactId>bcmail-jdk15on</artifactId>
+      <version>${bouncycastle.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.bouncycastle</groupId>
+      <artifactId>bcprov-jdk15on</artifactId>
+      <version>${bouncycastle.version}</version>
+    </dependency>
+    <dependency>
       <groupId>commons-logging</groupId>
       <artifactId>commons-logging</artifactId>
       <version>${commons.logging.version}</version>
@@ -87,6 +115,37 @@
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-package-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-office-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <!-- Copied from PDFBox:
+       For legal reasons (incompatible license), jai-imageio-core is to be used
+       only in the tests and may not be distributed. See also LEGAL-195 -->
+    <dependency>
+      <groupId>com.github.jai-imageio</groupId>
+      <artifactId>jai-imageio-core</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
   
   <build>

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
new file mode 100644
index 0000000..832b06e
--- /dev/null
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -0,0 +1,579 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.awt.image.BufferedImage;
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
+
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
+import javax.xml.stream.XMLStreamException;
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import 
org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import 
org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
+import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
+import org.apache.pdfbox.pdmodel.interactive.form.PDField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
+class AbstractPDF2XHTML extends PDFTextStripper {
+
+    /**
+     * Maximum recursive depth during AcroForm processing.
+     * Prevents theoretical AcroForm recursion bomb.
+     */
+    private final static int MAX_ACROFORM_RECURSIONS = 10;
+
+    private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new 
TesseractOCRConfig();
+
+    /**
+     * Format used for signature dates
+     * TODO Make this thread-safe
+     */
+    private final SimpleDateFormat dateFormat = new 
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
+
+
+    final List<IOException> exceptions = new ArrayList<>();
+    final PDDocument pdDocument;
+    final XHTMLContentHandler xhtml;
+    private final ParseContext context;
+    private final Metadata metadata;
+    final PDFParserConfig config;
+
+    private int pageIndex = 0;
+
+    /**
+     * Stores the collaborators used during extraction and wraps the
+     * supplied handler in an {@link XHTMLContentHandler}.
+     *
+     * @param pdDocument document that is rendered when OCR is requested
+     * @param handler    receiver of the generated SAX events
+     * @param context    consulted for the embedded document extractor and
+     *                   the Tesseract configuration
+     * @param metadata   handed to the XHTML handler; intermediate
+     *                   IOException warnings are added to it
+     * @param config     PDF parser settings read throughout this class
+     * @throws IOException declared only; nothing in this body throws it
+     *                     (presumably required by the superclass
+     *                     constructor - confirm)
+     */
+    AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, 
ParseContext context, Metadata metadata,
+                      PDFParserConfig config) throws IOException {
+        this.pdDocument = pdDocument;
+        this.xhtml = new XHTMLContentHandler(handler, metadata);
+        this.context = context;
+        this.metadata = metadata;
+        this.config = config;
+    }
+
+    /**
+     * Opens the {@code <div class="page">} element for a page and then
+     * delegates to {@code writeParagraphStart()}.
+     *
+     * @param page the page being started (not inspected here)
+     * @throws IOException wrapping the SAXException if the handler
+     *                     rejects the start tag
+     */
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            xhtml.startElement("div", "class", "page");
+        } catch (SAXException saxFailure) {
+            throw new IOExceptionWithCause("Unable to start a page",
+                    saxFailure);
+        }
+        writeParagraphStart();
+    }
+
+    /**
+     * Returns the extractor registered in the parse context, or a new
+     * {@link ParsingEmbeddedDocumentExtractor} bound to that context when
+     * none is registered.
+     */
+    EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
+        EmbeddedDocumentExtractor configured =
+                context.get(EmbeddedDocumentExtractor.class);
+        return (configured != null)
+                ? configured
+                : new ParsingEmbeddedDocumentExtractor(context);
+    }
+
+    /**
+     * Processes the files listed in the document's embedded-files name
+     * tree. Names on the root node are used when present; otherwise only
+     * the names of the root's direct kids are processed.
+     */
+    private void extractEmbeddedDocuments(PDDocument document)
+            throws IOException, SAXException, TikaException {
+        PDEmbeddedFilesNameTreeNode efTree =
+                new PDDocumentNameDictionary(document.getDocumentCatalog())
+                        .getEmbeddedFiles();
+        if (efTree == null) {
+            return;
+        }
+
+        //For now, try to get the names out of the root node or its kids.
+        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
+        //If there is a need we could add a fully recursive search to find
+        //a non-null Map<String, COSObjectable> that contains the doc info.
+        Map<String, PDComplexFileSpecification> rootNames =
+                efTree.getNames();
+        if (rootNames != null) {
+            processEmbeddedDocNames(rootNames);
+            return;
+        }
+
+        List<PDNameTreeNode<PDComplexFileSpecification>> kids =
+                efTree.getKids();
+        if (kids == null) {
+            return;
+        }
+        for (PDNameTreeNode<PDComplexFileSpecification> kid : kids) {
+            Map<String, PDComplexFileSpecification> kidNames =
+                    kid.getNames();
+            if (kidNames != null) {
+                processEmbeddedDocNames(kidNames);
+            }
+        }
+    }
+
+    /**
+     * Extracts every file specification in the given name map, using the
+     * map key as the display name. A null or empty map is a no-op.
+     */
+    private void processEmbeddedDocNames(
+            Map<String, PDComplexFileSpecification> embeddedFileNames)
+            throws IOException, SAXException, TikaException {
+        boolean nothingToDo =
+                embeddedFileNames == null || embeddedFileNames.isEmpty();
+        if (nothingToDo) {
+            return;
+        }
+
+        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+        for (Map.Entry<String, PDComplexFileSpecification> named
+                : embeddedFileNames.entrySet()) {
+            extractMultiOSPDEmbeddedFiles(
+                    named.getKey(), named.getValue(), extractor);
+        }
+    }
+
+    /**
+     * Extracts the generic, Mac, DOS and Unix embedded files of one file
+     * specification, in that order. A null specification is a no-op.
+     */
+    private void extractMultiOSPDEmbeddedFiles(String displayName,
+                                       PDComplexFileSpecification spec,
+                                       EmbeddedDocumentExtractor extractor)
+            throws IOException, SAXException, TikaException {
+
+        if (spec == null) {
+            return;
+        }
+        //current strategy is to pull all, not just first non-null
+        String unicodeName = spec.getFileUnicode();
+        extractPDEmbeddedFile(displayName, unicodeName,
+                spec.getFile(), spec.getEmbeddedFile(), extractor);
+        extractPDEmbeddedFile(displayName, unicodeName,
+                spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+        extractPDEmbeddedFile(displayName, unicodeName,
+                spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+        extractPDEmbeddedFile(displayName, unicodeName,
+                spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+    }
+
+    /**
+     * Hands one embedded file to the extractor and then writes an empty
+     * {@code <div class="embedded" id="fileName">} marker element.
+     *
+     * @param displayName     fallback name used when fileName is null
+     * @param unicodeFileName currently unused by this method
+     * @param fileName        name recorded in the embedded metadata
+     * @param file            embedded file; null is skipped silently
+     * @param extractor       decides whether to parse and does the parsing
+     */
+    private void extractPDEmbeddedFile(String displayName, String 
unicodeFileName,
+                                       String fileName, PDEmbeddedFile file,
+                                       EmbeddedDocumentExtractor extractor)
+            throws SAXException, IOException, TikaException {
+
+        if (file == null) {
+            //skip silently
+            return;
+        }
+        
+        fileName = (fileName == null) ? displayName : fileName;
+
+        // TODO: other metadata?
+        // fresh metadata for the attachment; shadows the instance field
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
+
+        if (extractor.shouldParseEmbedded(metadata)) {
+            TikaInputStream stream = null;
+            try {
+                stream = TikaInputStream.get(file.createInputStream());
+                extractor.parseEmbedded(
+                        stream,
+                        new EmbeddedContentHandler(xhtml),
+                        metadata, false);
+
+                // marker element is written after the embedded content
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", 
"embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", fileName);
+                xhtml.startElement("div", attributes);
+                xhtml.endElement("div");
+            } finally {
+                // close failures are ignored on purpose
+                IOUtils.closeQuietly(stream);
+            }
+        }
+    }
+
+    /**
+     * Either records or rethrows an intermediate IOException, depending
+     * on {@code config.isCatchIntermediateIOExceptions()}. When recorded,
+     * the message is added to the metadata as a warning and the exception
+     * is appended to {@code exceptions}.
+     *
+     * @param e the exception to handle
+     * @throws IOException {@code e} itself when catching is disabled
+     */
+    void handleCatchableIOE(IOException e) throws IOException {
+        if (!config.isCatchIntermediateIOExceptions()) {
+            throw e;
+        }
+        String raw = e.getMessage();
+        String warning = (raw == null) ? "IOException, no message" : raw;
+        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, warning);
+        exceptions.add(e);
+    }
+
+    /**
+     * Renders the page at {@code pageIndex} to a temporary image file and
+     * passes that file to Tesseract, which writes into {@code xhtml}.
+     * Returns immediately when the OCR strategy is NO_OCR.
+     *
+     * @throws TikaException if Tesseract is reported as unavailable
+     * @throws IOException   an intermediate IOException rethrown by
+     *                       {@link #handleCatchableIOE}, or a wrapped
+     *                       SAXException raised while writing OCR content
+     * @throws SAXException  declared; SAXExceptions raised inside the
+     *                       try block are wrapped as IOException instead
+     */
+    void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
+        if (config.getOCRStrategy().equals(NO_OCR)) {
+            return;
+        }
+        // fall back to the shared default when the context has no config
+        TesseractOCRConfig tesseractConfig =
+                context.get(TesseractOCRConfig.class, 
DEFAULT_TESSERACT_CONFIG);
+
+        TesseractOCRParser tesseractOCRParser = new TesseractOCRParser();
+        if (! tesseractOCRParser.hasTesseract(tesseractConfig)) {
+            throw new TikaException("Tesseract is not available. "+
+                    "Please set the OCR_STRATEGY to NO_OCR or configure 
Tesseract correctly");
+        }
+
+        PDFRenderer renderer = new PDFRenderer(pdDocument);
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            // NOTE(review): 2.0f is presumably a render scale factor while
+            // getOCRDPI() is only passed to the image writer - confirm
+            BufferedImage image = renderer.renderImage(pageIndex, 2.0f, 
config.getOCRImageType());
+            Path tmpFile = tmp.createTempFile();
+            try (OutputStream os = Files.newOutputStream(tmpFile)) {
+                //TODO: get output format from TesseractConfig
+                ImageIOUtil.writeImage(image, config.getOCRImageFormatName(),
+                        os, config.getOCRDPI());
+            }
+            try (InputStream is = TikaInputStream.get(tmpFile)) {
+                tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
+            }
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("error writing OCR content from 
PDF", e);
+        } finally {
+            // always release the temporary resources for this page
+            tmp.dispose();
+        }
+    }
+
+    /**
+     * Finishes a page: extracts files attached through annotations,
+     * optionally writes link and markup annotation text, optionally runs
+     * OCR, and closes the page {@code div}. {@code pageIndex} is advanced
+     * in all cases, including when an exception is raised.
+     *
+     * @param page the page whose annotations are inspected
+     * @throws IOException wrapping any SAXException or TikaException, or
+     *                     rethrown by {@link #handleCatchableIOE}; other
+     *                     IOExceptions reaching the outer catch are only
+     *                     recorded in {@code exceptions}
+     */
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+
+        try {
+            EmbeddedDocumentExtractor extractor = 
getEmbeddedDocumentExtractor();
+            for (PDAnnotation annotation : page.getAnnotations()) {
+
+                if (annotation instanceof PDAnnotationFileAttachment) {
+                    PDAnnotationFileAttachment fann = 
(PDAnnotationFileAttachment) annotation;
+                    // NOTE(review): unchecked cast; presumably getFile()
+                    // can return other specification types - confirm
+                    PDComplexFileSpecification fileSpec = 
(PDComplexFileSpecification) fann.getFile();
+                    try {
+                        
extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor);
+                    } catch (SAXException e) {
+                        throw new IOExceptionWithCause("file embedded in 
annotation sax exception", e);
+                    } catch (TikaException e) {
+                        throw new IOExceptionWithCause("file embedded in 
annotation tika exception", e);
+                    } catch (IOException e) {
+                        handleCatchableIOE(e);
+                    }
+                }
+                // TODO: remove once PDFBOX-1143 is fixed:
+                if (config.getExtractAnnotationText()) {
+                    // URI links become an empty anchor inside an
+                    // annotation div
+                    if (annotation instanceof PDAnnotationLink) {
+                        PDAnnotationLink annotationlink = (PDAnnotationLink) 
annotation;
+                        if (annotationlink.getAction() != null) {
+                            PDAction action = annotationlink.getAction();
+                            if (action instanceof PDActionURI) {
+                                PDActionURI uri = (PDActionURI) action;
+                                String link = uri.getURI();
+                                if (link != null) {
+                                    xhtml.startElement("div", "class", 
"annotation");
+                                    xhtml.startElement("a", "href", link);
+                                    xhtml.endElement("a");
+                                    xhtml.endElement("div");
+                                }
+                            }
+                        }
+                    }
+
+                    // markup annotations: title, subject and contents,
+                    // each in its own div when non-null
+                    if (annotation instanceof PDAnnotationMarkup) {
+                        PDAnnotationMarkup annotationMarkup = 
(PDAnnotationMarkup) annotation;
+                        String title = annotationMarkup.getTitlePopup();
+                        String subject = annotationMarkup.getSubject();
+                        String contents = annotationMarkup.getContents();
+                        // TODO: maybe also annotationMarkup.getRichContents()?
+                        if (title != null || subject != null || contents != 
null) {
+                            xhtml.startElement("div", "class", "annotation");
+
+                            if (title != null) {
+                                xhtml.startElement("div", "class", 
"annotationTitle");
+                                xhtml.characters(title);
+                                xhtml.endElement("div");
+                            }
+
+                            if (subject != null) {
+                                xhtml.startElement("div", "class", 
"annotationSubject");
+                                xhtml.characters(subject);
+                                xhtml.endElement("div");
+                            }
+
+                            if (contents != null) {
+                                xhtml.startElement("div", "class", 
"annotationContents");
+                                xhtml.characters(contents);
+                                xhtml.endElement("div");
+                            }
+
+                            xhtml.endElement("div");
+                        }
+                    }
+                }
+            }
+            // OCR output, when requested together with text extraction,
+            // is written inside the page div before it is closed
+            if 
(config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION))
 {
+                doOCROnCurrentPage();
+            }
+            xhtml.endElement("div");
+        } catch (SAXException|TikaException e) {
+            throw new IOExceptionWithCause("Unable to end a page", e);
+        } catch (IOException e) {
+            // recorded unconditionally; does not consult
+            // isCatchIntermediateIOExceptions()
+            exceptions.add(e);
+        } finally {
+            pageIndex++;
+        }
+    }
+
+    /**
+     * Starts the XHTML document on the wrapped handler.
+     *
+     * @param pdf the document being processed (not inspected here)
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            xhtml.startDocument();
+        } catch (SAXException saxFailure) {
+            throw new IOExceptionWithCause("Unable to start a document",
+                    saxFailure);
+        }
+    }
+
+    /**
+     * Writes the trailing document content - bookmark text, embedded
+     * documents and, when enabled in the config, AcroForm content - and
+     * then ends the XHTML document.
+     *
+     * @param pdf the document being processed
+     * @throws IOException rethrown by {@link #handleCatchableIOE}, or
+     *                     wrapping a TikaException or SAXException
+     */
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            // Extract text for any bookmarks:
+            extractBookmarkText();
+
+            try {
+                extractEmbeddedDocuments(pdf);
+            } catch (IOException embeddedFailure) {
+                handleCatchableIOE(embeddedFailure);
+            }
+
+            //extract acroform data at end of doc
+            if (config.getExtractAcroFormContent()) {
+                try {
+                    extractAcroForm(pdf);
+                } catch (IOException formFailure) {
+                    handleCatchableIOE(formFailure);
+                }
+            }
+            xhtml.endDocument();
+        } catch (TikaException | SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a document", e);
+        }
+    }
+
+    /**
+     * Writes the document outline, if the catalog has one, as nested
+     * lists.
+     *
+     * @throws SAXException if the handler rejects an event
+     */
+    void extractBookmarkText() throws SAXException {
+        PDDocumentOutline root =
+                document.getDocumentCatalog().getDocumentOutline();
+        if (root == null) {
+            return;
+        }
+        extractBookmarkText(root);
+    }
+
+    /**
+     * Writes the children of an outline node as a {@code ul}, one
+     * {@code li} per child title, recursing into each child right after
+     * its item. Nothing is written when the node has no children.
+     *
+     * @param bookmark outline node whose children are written
+     * @throws SAXException if the handler rejects an event
+     */
+    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
+        PDOutlineItem first = bookmark.getFirstChild();
+        if (first == null) {
+            return;
+        }
+        xhtml.startElement("ul");
+        for (PDOutlineItem item = first; item != null;
+             item = item.getNextSibling()) {
+            xhtml.startElement("li");
+            xhtml.characters(item.getTitle());
+            xhtml.endElement("li");
+            // Recurse:
+            extractBookmarkText(item);
+        }
+        xhtml.endElement("ul");
+    }
+
+    /**
+     * Writes form content. When the form carries XFA data it is tried
+     * first; on success nothing else is written. If XFA is absent or its
+     * extraction fails, the AcroForm fields are written as an ordered
+     * list inside {@code <div class="acroform">}.
+     *
+     * @param pdf document whose catalog is inspected for a form
+     * @throws IOException  propagated from field processing
+     * @throws SAXException if the handler rejects an event
+     */
+    void extractAcroForm(PDDocument pdf) throws IOException,
+            SAXException {
+        //Thank you, Ben Litchfield, for
+        //org.apache.pdfbox.examples.fdf.PrintFields
+        //this code derives from Ben's code
+        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
+        PDAcroForm form = (catalog == null) ? null : catalog.getAcroForm();
+        if (form == null) {
+            return;
+        }
+
+        //if it has xfa, try that.
+        //if it doesn't exist or there's an exception,
+        //go with traditional AcroForm
+        PDXFAResource xfa = form.getXFA();
+        if (xfa != null) {
+            XFAExtractor xfaExtractor = new XFAExtractor();
+            try (InputStream is = new BufferedInputStream(
+                    new ByteArrayInputStream(xfa.getBytes()))) {
+                xfaExtractor.extract(is, xhtml, metadata, context);
+                //success: nothing more to write
+                return;
+            } catch (XMLStreamException | IOException ignored) {
+                //deliberately ignored: fall back to the AcroForm
+            }
+        }
+
+        @SuppressWarnings("rawtypes")
+        List fields = form.getFields();
+        if (fields == null) {
+            return;
+        }
+
+        xhtml.startElement("div", "class", "acroform");
+        xhtml.startElement("ol");
+        for (Object candidate : fields) {
+            if (candidate instanceof PDField) {
+                processAcroField((PDField) candidate, 0);
+            }
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Writes one field and, for non-terminal fields, its children in a
+     * nested {@code ol}. Fields at or beyond
+     * {@code MAX_ACROFORM_RECURSIONS} levels deep are skipped.
+     *
+     * @param field                 field to write
+     * @param currentRecursiveDepth depth of {@code field}; 0 at the top
+     */
+    private void processAcroField(PDField field,
+                                  final int currentRecursiveDepth)
+            throws SAXException, IOException {
+
+        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
+            return;
+        }
+        addFieldString(field);
+        if (!(field instanceof PDNonTerminalField)) {
+            return;
+        }
+
+        final int childDepth = currentRecursiveDepth + 1;
+        xhtml.startElement("ol");
+        for (PDField child : ((PDNonTerminalField) field).getChildren()) {
+            processAcroField(child, childDepth);
+        }
+        xhtml.endElement("ol");
+    }
+
+    /**
+     * Writes a single field as an {@code li}: the partial name and value
+     * form the text, the alternate name becomes the {@code altName}
+     * attribute. Signature fields are delegated to
+     * {@link #handleSignature}. Nothing is written when there is neither
+     * text nor an attribute.
+     */
+    private void addFieldString(PDField field) throws SAXException {
+        //Pick partial name to present in content and altName for attribute
+        //Ignoring FullyQualifiedName for now
+        String partName = field.getPartialName();
+        String altName = field.getAlternateFieldName();
+
+        AttributesImpl attrs = new AttributesImpl();
+        if (altName != null) {
+            attrs.addAttribute("", "altName", "altName", "CDATA", altName);
+        }
+
+        //signature fields get their own rendering
+        if (field instanceof PDSignatureField) {
+            handleSignature(attrs, (PDSignatureField) field);
+            return;
+        }
+
+        StringBuilder text = new StringBuilder();
+        if (partName != null) {
+            text.append(partName).append(": ");
+        }
+        String value = field.getValueAsString();
+        if (value != null && !"null".equals(value)) {
+            text.append(value);
+        }
+
+        if (attrs.getLength() == 0 && text.length() == 0) {
+            return;
+        }
+        xhtml.startElement("li", attrs);
+        xhtml.characters(text.toString());
+        xhtml.endElement("li");
+    }
+
+    /**
+     * Writes the data of a signature field as an {@code li} holding a
+     * nested {@code <ol type="signaturedata">}, with one {@code li} per
+     * non-empty value (name, contactInfo, location, reason, date).
+     * Nothing is written when the field has no signature or when every
+     * value is null or empty.
+     *
+     * @param parentAttributes attributes for the outer {@code li}
+     * @param sigField         signature field to render
+     * @throws SAXException if the handler rejects an event
+     */
+    private void handleSignature(AttributesImpl parentAttributes,
+                                 PDSignatureField sigField)
+            throws SAXException {
+
+        PDSignature sig = sigField.getSignature();
+        if (sig == null) {
+            return;
+        }
+        //TreeMap keeps the output order stable (sorted by key)
+        Map<String, String> vals = new TreeMap<>();
+        vals.put("name", sig.getName());
+        vals.put("contactInfo", sig.getContactInfo());
+        vals.put("location", sig.getLocation());
+        vals.put("reason", sig.getReason());
+
+        Calendar cal = sig.getSignDate();
+        if (cal != null) {
+            dateFormat.setTimeZone(cal.getTimeZone());
+            vals.put("date", dateFormat.format(cal.getTime()));
+        }
+        //see if there is any data; this must inspect the values, because
+        //the keys are the constants above and are never null or empty
+        boolean hasData = false;
+        for (String val : vals.values()) {
+            if (val != null && !val.equals("")) {
+                hasData = true;
+                break;
+            }
+        }
+        if (!hasData) {
+            return;
+        }
+
+        xhtml.startElement("li", parentAttributes);
+
+        AttributesImpl attrs = new AttributesImpl();
+        attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
+
+        xhtml.startElement("ol", attrs);
+        for (Map.Entry<String, String> e : vals.entrySet()) {
+            if (e.getValue() == null || e.getValue().equals("")) {
+                continue;
+            }
+            attrs = new AttributesImpl();
+            attrs.addAttribute("", "signdata", "signdata", "CDATA",
+                    e.getKey());
+            xhtml.startElement("li", attrs);
+            xhtml.characters(e.getValue());
+            xhtml.endElement("li");
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("li");
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
new file mode 100644
index 0000000..775e590
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pdf;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Decides, from a document's permission metadata, whether its content
+ * may be extracted in general or for accessibility purposes only.
+ */
+public class AccessChecker implements Serializable {
+
+    private static final long serialVersionUID = 6492570218190936986L;
+
+    private final boolean needToCheck;
+    private final boolean allowAccessibility;
+
+    /**
+     * Constructs an {@link AccessChecker} that performs no checking, so
+     * {@link #check(Metadata)} always returns without throwing.
+     * <p>
+     * This constructor is available to allow for Tika's legacy
+     * (&lt;= v1.7) behavior.
+     */
+    public AccessChecker() {
+        this.needToCheck = false;
+        this.allowAccessibility = true;
+    }
+
+    /**
+     * Constructs an {@link AccessChecker} that checks whether content
+     * should be extracted from a document.
+     *
+     * @param allowExtractionForAccessibility if general extraction is not
+     *        allowed, is extraction for accessibility allowed
+     */
+    public AccessChecker(boolean allowExtractionForAccessibility) {
+        this.needToCheck = true;
+        this.allowAccessibility = allowExtractionForAccessibility;
+    }
+
+    /**
+     * Checks whether a document's content should be extracted, based on
+     * the metadata values and the accessibility setting given to the
+     * constructor.
+     *
+     * @param metadata metadata holding the access permission values
+     * @throws AccessPermissionException if access is not permitted
+     */
+    public void check(Metadata metadata) throws AccessPermissionException {
+        if (!needToCheck) {
+            return;
+        }
+        boolean extractionDenied = "false".equals(
+                metadata.get(AccessPermissions.EXTRACT_CONTENT));
+        if (!extractionDenied) {
+            return;
+        }
+        if (!allowAccessibility) {
+            throw new AccessPermissionException(
+                    "Content extraction is not allowed.");
+        }
+        boolean accessibilityGranted = "true".equals(
+                metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY));
+        if (accessibilityGranted) {
+            return;
+        }
+        throw new AccessPermissionException(
+                "Content extraction for accessibility is not allowed.");
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
new file mode 100644
index 0000000..3ad551d
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Variant of {@link AbstractPDF2XHTML} that produces content through OCR
+ * only: every text-writing hook of {@link PDFTextStripper} is a no-op and
+ * each page is handled by running OCR on it.
+ */
+class OCR2XHTML extends AbstractPDF2XHTML {
+
+    private OCR2XHTML(PDDocument document, ContentHandler handler,
+                      ParseContext context, Metadata metadata,
+                      PDFParserConfig config)
+            throws IOException {
+        super(document, handler, context, metadata, config);
+    }
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     *
+     * @param document PDF document
+     * @param handler  SAX content handler
+     * @param context  parse context passed on to the stripper
+     * @param metadata PDF metadata
+     * @param config   PDF parser settings
+     * @throws SAXException  if the content handler fails to process SAX
+     *                       events
+     * @throws TikaException if there was an exception outside of per page
+     *                       processing, or if any intermediate exception
+     *                       was recorded (the first one is the cause)
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler,
+            ParseContext context, Metadata metadata,
+            PDFParserConfig config)
+            throws SAXException, TikaException {
+        try {
+            OCR2XHTML stripper =
+                    new OCR2XHTML(document, handler, context, metadata,
+                            config);
+            //the stripper's own text output is discarded
+            Writer discard = new Writer() {
+                @Override
+                public void write(char[] cbuf, int off, int len) {
+                }
+
+                @Override
+                public void flush() {
+                }
+
+                @Override
+                public void close() {
+                }
+            };
+            stripper.writeText(document, discard);
+
+            if (!stripper.exceptions.isEmpty()) {
+                //throw the first
+                throw new TikaException("Unable to extract all PDF content",
+                        stripper.exceptions.get(0));
+            }
+        } catch (IOException e) {
+            Throwable cause = e.getCause();
+            if (cause instanceof SAXException) {
+                throw (SAXException) cause;
+            }
+            throw new TikaException("Unable to extract PDF content", e);
+        }
+    }
+
+    /**
+     * Handles a page by opening it, running OCR on it and closing it.
+     * TikaException and SAXException are wrapped as IOException; an
+     * IOException is routed through {@code handleCatchableIOE}.
+     */
+    @Override
+    public void processPage(PDPage pdPage) throws IOException {
+        try {
+            startPage(pdPage);
+            doOCROnCurrentPage();
+            endPage(pdPage);
+        } catch (TikaException | SAXException wrapped) {
+            throw new IOExceptionWithCause(wrapped);
+        } catch (IOException ioFailure) {
+            handleCatchableIOE(ioFailure);
+        }
+    }
+
+    @Override
+    protected void writeString(String text) throws IOException {
+        //no-op
+    }
+
+    @Override
+    protected void writeCharacters(TextPosition text) throws IOException {
+        //no-op
+    }
+
+    @Override
+    protected void writeWordSeparator() throws IOException {
+        //no-op
+    }
+
+    @Override
+    protected void writeLineSeparator() throws IOException {
+        //no-op
+    }
+
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
new file mode 100644
index 0000000..ac9823e
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -0,0 +1,339 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Writer;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
+import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
+import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to produce a semi-structured XHTML SAX events instead of a plain text
+ * stream.
+ */
+class PDF2XHTML extends AbstractPDF2XHTML {
+
+
+    private static final List<String> JPEG = Arrays.asList(
+            COSName.DCT_DECODE.getName(),
+            COSName.DCT_DECODE_ABBREVIATION.getName());
+
+    /**
+     * This keeps track of the pdf object ids for inline
+     * images that have been processed.
+     * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()
+     * is true, this will be checked before extracting an embedded image.
+     * The integer keeps track of the inlineImageCounter for that image.
+     * This integer is used to identify images in the markup.
+     *
+     * This is used across the document.  To avoid infinite recursion
+     * TIKA-1742, we're limiting the export to one image per page.
+     */
+    private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
+    private int inlineImageCounter = 0;
+    private PDF2XHTML(PDDocument document, ContentHandler handler, 
ParseContext context, Metadata metadata,
+                      PDFParserConfig config)
+            throws IOException {
+        super(document, handler, context, metadata, config);
+    }
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     *
+     * @param document PDF document
+     * @param handler  SAX content handler
+     * @param metadata PDF metadata
+     * @throws SAXException  if the content handler fails to process SAX events
+     * @throws TikaException if there was an exception outside of per page 
processing
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, ParseContext context, 
Metadata metadata,
+            PDFParserConfig config)
+            throws SAXException, TikaException {
+        PDF2XHTML pdf2XHTML = null;
+        try {
+            // Extract text using a dummy Writer as we override the
+            // key methods to output to the given content
+            // handler.
+            pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, 
config);
+
+            config.configure(pdf2XHTML);
+
+            pdf2XHTML.writeText(document, new Writer() {
+                @Override
+                public void write(char[] cbuf, int off, int len) {
+                }
+
+                @Override
+                public void flush() {
+                }
+
+                @Override
+                public void close() {
+                }
+            });
+
+        } catch (IOException e) {
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Unable to extract PDF content", e);
+            }
+        }
+        if (pdf2XHTML.exceptions.size() > 0) {
+            //throw the first
+            throw new TikaException("Unable to extract all PDF content",
+                    pdf2XHTML.exceptions.get(0));
+        }
+    }
+
+
+    @Override
+    public void processPage(PDPage page) throws IOException {
+        try {
+            super.processPage(page);
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        }
+    }
+
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+        try {
+            writeParagraphEnd();
+            try {
+                extractImages(page.getResources(), new HashSet<COSBase>());
+            } catch (IOException e) {
+                handleCatchableIOE(e);
+            }
+            super.endPage(page);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a page", e);
+        } catch (IOException e) {
+            exceptions.add(e);
+        }
+    }
+
+    private void extractImages(PDResources resources, Set<COSBase> 
seenThisPage) throws SAXException, IOException {
+        if (resources == null || config.getExtractInlineImages() == false) {
+            return;
+        }
+
+        for (COSName name : resources.getXObjectNames()) {
+
+            PDXObject object = resources.getXObject(name);
+            if (object == null) {
+                continue;
+            }
+            COSStream cosStream = object.getCOSObject();
+            if (seenThisPage.contains(cosStream)) {
+                //avoid infinite recursion TIKA-1742
+                continue;
+            }
+            seenThisPage.add(cosStream);
+
+            if (object instanceof PDFormXObject) {
+                extractImages(((PDFormXObject) object).getResources(), 
seenThisPage);
+            } else if (object instanceof PDImageXObject) {
+
+                PDImageXObject image = (PDImageXObject) object;
+
+                Metadata metadata = new Metadata();
+                String extension = image.getSuffix();
+                if (extension == null) {
+                    metadata.set(Metadata.CONTENT_TYPE, "image/png");
+                    extension = "png";
+                } else if (extension.equals("jpg")) {
+                    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+                } else if (extension.equals("tiff")) {
+                    metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+                    extension = "tif";
+                } else {
+                    //TODO: determine if we need to add more image types
+                    //throw new RuntimeException("EXTEN:" + extension);
+                }
+
+                Integer imageNumber = processedInlineImages.get(cosStream);
+                if (imageNumber == null) {
+                    imageNumber = inlineImageCounter++;
+                }
+                String fileName = "image" + imageNumber + "."+extension;
+                metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+
+                // Output the img tag
+                AttributesImpl attr = new AttributesImpl();
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + 
fileName);
+                attr.addAttribute("", "alt", "alt", "CDATA", fileName);
+                xhtml.startElement("img", attr);
+                xhtml.endElement("img");
+
+                //Do we only want to process unique COSObject ids?
+                //If so, have we already processed this one?
+                if (config.getExtractUniqueInlineImagesOnly() == true) {
+                    if (processedInlineImages.containsKey(cosStream)) {
+                        continue;
+                    }
+                    processedInlineImages.put(cosStream, imageNumber);
+                }
+
+                metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                        
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+
+                EmbeddedDocumentExtractor extractor =
+                        getEmbeddedDocumentExtractor();
+                if (extractor.shouldParseEmbedded(metadata)) {
+                    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+                    try {
+                        //TODO: handle image.getMetadata()?
+                        writeToBuffer(image, extension, buffer);
+                        extractor.parseEmbedded(
+                                new ByteArrayInputStream(buffer.toByteArray()),
+                                new EmbeddedContentHandler(xhtml),
+                                metadata, false);
+                    } catch (IOException e) {
+                        handleCatchableIOE(e);
+                    }
+                }
+            }
+        }
+    }
+
+    //nearly directly copied from PDFBox ExtractImages
+    private void writeToBuffer(PDImageXObject pdImage, String suffix, 
OutputStream out)
+            throws IOException {
+
+        BufferedImage image = pdImage.getImage();
+        if (image != null) {
+            if ("jpg".equals(suffix)) {
+                String colorSpaceName = pdImage.getColorSpace().getName();
+                //TODO: figure out if we want directJPEG as a configuration
+                //previously: if (directJPeg || PDDeviceGray....
+                if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
+                        PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) 
{
+                    // RGB or Gray colorspace: get and write the 
unmodifiedJPEG stream
+                    InputStream data = 
pdImage.getStream().createInputStream(JPEG);
+                    org.apache.pdfbox.io.IOUtils.copy(data, out);
+                    org.apache.pdfbox.io.IOUtils.closeQuietly(data);
+                } else {
+                    // for CMYK and other "unusual" colorspaces, the JPEG will 
be converted
+                    ImageIOUtil.writeImage(image, suffix, out);
+                }
+            } else {
+                ImageIOUtil.writeImage(image, suffix, out);
+            }
+        }
+        out.flush();
+    }
+
+    @Override
+    protected void writeParagraphStart() throws IOException {
+        super.writeParagraphStart();
+        try {
+            xhtml.startElement("p");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a paragraph", e);
+        }
+    }
+
+    @Override
+    protected void writeParagraphEnd() throws IOException {
+        super.writeParagraphEnd();
+        try {
+            xhtml.endElement("p");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a paragraph", e);
+        }
+    }
+
+    @Override
+    protected void writeString(String text) throws IOException {
+        try {
+            xhtml.characters(text);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a string: " + text, e);
+        }
+    }
+
+    @Override
+    protected void writeCharacters(TextPosition text) throws IOException {
+        try {
+            xhtml.characters(text.getUnicode());
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a character: " + text.getUnicode(), e);
+        }
+    }
+
+    @Override
+    protected void writeWordSeparator() throws IOException {
+        try {
+            xhtml.characters(getWordSeparator());
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a space character", e);
+        }
+    }
+
+    @Override
+    protected void writeLineSeparator() throws IOException {
+        try {
+            xhtml.newline();
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a newline character", e);
+        }
+    }
+
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
new file mode 100644
index 0000000..057f833
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pdf;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.io.RandomAccessBuffer;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.pdfparser.COSParser;
+
+/**
+ * In fairly rare cases, a PDF's XMP will contain a string that
+ * has incorrectly been encoded with PDFEncoding: an octal for non-ascii and
+ * ascii for ascii, e.g.
+ * "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000"
+ * <p>
+ * This class can be used to decode those strings.
+ * <p>
+ * See TIKA-1678.  Many thanks to Andrew Jackson for raising this issue
+ * and Tilman Hausherr for the solution.
+ * <p>
+ * As of this writing, we are only handling strings that start with
+ * an encoded BOM.  Andrew Jackson found a handful of other examples (e.g.
+ * this ISO-8859-7 string:
+ * "Microsoft Word - \\323\\365\\354\\354\\345\\364\\357\\367\\336
+ * \\364\\347\\362 PRAKSIS \\363\\364\\357")
+ * that we aren't currently handling.
+ */
+class PDFEncodedStringDecoder {
+
+    /** Octal-escaped byte order marks, as they appear in the raw text. */
+    private static final String[] PDF_ENCODING_BOMS = {
+            "\\376\\377", //UTF-16BE
+            "\\377\\376", //UTF-16LE
+            "\\357\\273\\277"//UTF-8
+    };
+
+    /**
+     * Does this string start with an octal-encoded UTF BOM?
+     * Call this statically to determine if you should bother creating a
+     * new parser to parse it.
+     *
+     * @param s string to test, may be null
+     * @return true if {@code s} is at least 8 characters long (the
+     *         length of the shortest encoded BOM) and starts with one
+     *         of the encoded BOMs; false otherwise, including for null
+     */
+    static boolean shouldDecode(String s) {
+        if (s == null || s.length() < 8) {
+            return false;
+        }
+        for (String bom : PDF_ENCODING_BOMS) {
+            if (s.startsWith(bom)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * This assumes that {@link #shouldDecode(String)} has been called
+     * and has returned true.  If you run this on a non-octal encoded string,
+     * disaster will happen!
+     *
+     * @param value octal-encoded string to decode
+     * @return the decoded string, or {@code value} itself if it is null
+     *         or could not be parsed
+     */
+    String decode(String value) {
+        if (value == null) {
+            // otherwise the text "(null)" would be parsed and the
+            // literal string "null" returned
+            return null;
+        }
+        try {
+            // wrap in parentheses so that it parses as a PDF string
+            byte[] bytes = ("(" + value + ")").getBytes(ISO_8859_1);
+            InputStream is = new ByteArrayInputStream(bytes);
+            COSStringParser p =
+                    new COSStringParser(new RandomAccessBuffer(is));
+            String parsed = p.myParseCOSString();
+            if (parsed != null) {
+                return parsed;
+            }
+        } catch (IOException e) {
+            //oh well, we tried.
+        }
+        //just return value if something went wrong
+        return value;
+    }
+
+    /**
+     * Minimal {@link COSParser} subclass that makes the string parsing
+     * of the parent class callable from this decoder.  Static because
+     * it uses no state of the enclosing instance.
+     */
+    static class COSStringParser extends COSParser {
+
+        COSStringParser(RandomAccessRead buffer) throws IOException {
+            super(buffer);
+        }
+
+        /**
+         * Parses a single string from the underlying buffer.
+         *
+         * @return parsed string or null if something went wrong.
+         */
+        String myParseCOSString() {
+            try {
+                COSString cosString = parseCOSString();
+                if (cosString != null) {
+                    return cosString.getString();
+                }
+            } catch (IOException e) {
+                // deliberately ignored: failure is reported to the
+                // caller through the null return value
+            }
+            return null;
+        }
+    }
+}

Reply via email to