[3/5] tika git commit: TIKA-2059 - Merge multimedia and pdf parser modules and bundles

bob Sun, 28 Aug 2016 09:30:32 -0700

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 
b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
new file mode 100644
index 0000000..d16d3c3
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -0,0 +1,1240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.DocumentSelector;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing pdf files.
+ */
+public class PDFParserTest extends TikaTest {
+
+    public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
+    public static final MediaType TYPE_EMF = MediaType.application("x-emf");
+    public static final MediaType TYPE_PDF = MediaType.application("pdf");
+    public static final MediaType TYPE_DOCX = 
MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    public static final MediaType TYPE_DOC = MediaType.application("msword");
+    public static Level PDFBOX_LOG_LEVEL = Level.INFO; // overwritten by setup() with the pdfbox logger's level; re-applied by tearDown()
+    private static Boolean hasTesseract = null; // cache for canRunOCR(); stays null until the first call
+
+    /**
+     * Returns what {@link TesseractOCRParser#hasTesseract(TesseractOCRConfig)}
+     * reports for a freshly constructed {@link TesseractOCRConfig}.
+     * The answer is computed on the first call and then served from the
+     * {@code hasTesseract} cache.
+     *
+     * @return the cached availability flag
+     */
+    public static boolean canRunOCR() {
+        if (hasTesseract == null) {
+            TesseractOCRParser ocrParser = new TesseractOCRParser();
+            hasTesseract = ocrParser.hasTesseract(new TesseractOCRConfig());
+        }
+        return hasTesseract;
+    }
+
+    /**
+     * Saves the current level of the "org.apache.pdfbox" logger in
+     * {@code PDFBOX_LOG_LEVEL} and then switches that logger off for this class.
+     */
+    @BeforeClass
+    public static void setup() {
+        Logger pdfBoxLogger = Logger.getLogger("org.apache.pdfbox");
+        PDFBOX_LOG_LEVEL = pdfBoxLogger.getLevel();
+        pdfBoxLogger.setLevel(Level.OFF);
+    }
+
+    /**
+     * Puts the "org.apache.pdfbox" logger back to the level stored in
+     * {@code PDFBOX_LOG_LEVEL}.
+     */
+    @AfterClass
+    public static void tearDown() {
+        Logger pdfBoxLogger = Logger.getLogger("org.apache.pdfbox");
+        pdfBoxLogger.setLevel(PDFBOX_LOG_LEVEL);
+    }
+
+    /**
+     * Counts the occurrences of {@code needle} in {@code haystack}.
+     * Occurrences may overlap: the search resumes one character after the
+     * start of the previous hit, so "aa" is found three times in "aaaa".
+     *
+     * @param needle   text to look for; an empty needle yields 0
+     * @param haystack text to search in
+     * @return number of (possibly overlapping) occurrences
+     */
+    private static int substringCount(String needle, String haystack) {
+        // indexOf("") never returns -1, so an empty needle would loop forever.
+        if (needle.isEmpty()) {
+            return 0;
+        }
+        int count = 0;
+        int hit = haystack.indexOf(needle);
+        while (hit != -1) {
+            count++;
+            hit = haystack.indexOf(needle, hit + 1);
+        }
+        return count;
+    }
+
+    /**
+     * Parses testPDF.pdf and checks the reported metadata, a handful of text
+     * snippets, and that neighbouring headline/paragraph text is not run together.
+     */
+    @Test
+    public void testPdfParsing() throws Exception {
+        XMLResult parsed = getXML("testPDF.pdf");
+        Metadata meta = parsed.metadata;
+        String xhtml = parsed.xml;
+
+        assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
+        assertEquals("Bertrand Delacr\u00e9taz", meta.get(TikaCoreProperties.CREATOR));
+        assertEquals("Bertrand Delacr\u00e9taz", meta.get(Metadata.AUTHOR));
+        assertEquals("Firefox", meta.get(TikaCoreProperties.CREATOR_TOOL));
+        assertEquals("Apache Tika - Apache Tika", meta.get(TikaCoreProperties.TITLE));
+
+        // Dates are not asserted: they can't be tested reliably yet - see TIKA-451
+        // (expected "Sat Sep 15 10:02:31 BST 2007" for both creation and last-modified).
+
+        assertContains("Apache Tika", xhtml);
+        assertContains("Tika - Content Analysis Toolkit", xhtml);
+        assertContains("incubator", xhtml);
+        assertContains("Apache Software Foundation", xhtml);
+
+        // The end of one paragraph must be separated from the start of the next:
+        // a word boundary is expected after the headline ...
+        assertNotContained("ToolkitApache", xhtml);
+        // ... and between paragraphs.
+        assertNotContained("libraries.Apache", xhtml);
+    }
+
+    /**
+     * Parses testPDF.pdf and checks only the metadata values
+     * (content type, creator, creator tool, title).
+     */
+    @Test
+    public void testPdfParsingMetadataOnly() throws Exception {
+        Metadata meta = getXML("testPDF.pdf").metadata;
+
+        assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
+        assertEquals("Bertrand Delacr\u00e9taz", meta.get(TikaCoreProperties.CREATOR));
+        assertEquals("Firefox", meta.get(TikaCoreProperties.CREATOR_TOOL));
+        assertEquals("Apache Tika - Apache Tika", meta.get(TikaCoreProperties.TITLE));
+    }
+
+    /**
+     * Parses testPDF-custommetadata.pdf and checks the standard metadata,
+     * a single-valued custom property and a two-valued custom array property.
+     */
+    @Test
+    public void testCustomMetadata() throws Exception {
+        XMLResult parsed = getXML("testPDF-custommetadata.pdf");
+        Metadata meta = parsed.metadata;
+
+        assertEquals("application/pdf", meta.get(Metadata.CONTENT_TYPE));
+        assertEquals("Document author", meta.get(TikaCoreProperties.CREATOR));
+        assertEquals("Document author", meta.get(Metadata.AUTHOR));
+        assertEquals("Document title", meta.get(TikaCoreProperties.TITLE));
+
+        assertEquals("Custom Value", meta.get("Custom Property"));
+
+        // get() returns the first entry; getValues() exposes both entries
+        assertEquals("Array Entry 1", meta.get("Custom Array"));
+        String[] arrayEntries = meta.getValues("Custom Array");
+        assertEquals(2, arrayEntries.length);
+        assertEquals("Array Entry 1", arrayEntries[0]);
+        assertEquals("Array Entry 2", arrayEntries[1]);
+
+        assertContains("Hello World!", parsed.xml);
+    }
+
+    /**
+     * PDFs can be "protected" with the default password. This means
+     * they're encrypted (potentially both text and metadata),
+     * but we can decrypt them easily.
+     * <p>
+     * Three cases are covered: no password provider, a provider returning the
+     * empty password, and a provider returning a wrong password (which must
+     * raise {@link EncryptedDocumentException}).
+     */
+    @Test
+    public void testProtectedPDF() throws Exception {
+
+        // Case 1: no password provider in the context
+        XMLResult r = getXML("testPDF_protected.pdf");
+        Metadata metadata = r.metadata;
+        assertEquals("true", metadata.get("pdf:encrypted"));
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
+        assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
+
+        assertContains("RETHINKING THE FINANCIAL NETWORK", r.xml);
+        assertContains("On 16 November 2002", r.xml);
+        assertContains("In many important respects", r.xml);
+
+        // Case 2: try again with an explicit empty password
+        ParseContext context = new ParseContext();
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata ignored) {
+                return "";
+            }
+        });
+        r = getXML("testPDF_protected.pdf", context);
+        metadata = r.metadata;
+        assertEquals("true", metadata.get("pdf:encrypted"));
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
+        assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
+
+        assertContains("RETHINKING THE FINANCIAL NETWORK", r.xml);
+        assertContains("On 16 November 2002", r.xml);
+        assertContains("In many important respects", r.xml);
+
+        // Case 3: wrong password
+        context = new ParseContext();
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata ignored) {
+                return "WRONG!!!!";
+            }
+        });
+
+        // A fresh Metadata is passed in so it can be inspected after the failure.
+        metadata = new Metadata();
+        boolean ex = false;
+        try {
+            getXML("testPDF_protected.pdf", new AutoDetectParser(), metadata, context);
+        } catch (EncryptedDocumentException e) {
+            ex = true;
+        }
+
+        assertTrue("encryption exception", ex);
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("true", metadata.get("pdf:encrypted"));
+        //pdf:encrypted, X-Parsed-By and Content-Type
+        assertEquals("very little metadata should be parsed", 3, metadata.names().length);
+        // No content assertion here: getXML throws before a result is returned.
+
+    }
+
+    /**
+     * Extracts text from testPDFTwoTextBoxes.pdf via AutoDetectParser and checks
+     * that, after whitespace is collapsed, the left column's lines precede the
+     * right column's lines.
+     */
+    @Test
+    public void testTwoTextBoxes() throws Exception {
+        Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
+        String text;
+        try (InputStream in = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFTwoTextBoxes.pdf")) {
+            text = getText(in, autoDetect);
+        }
+        String collapsed = text.replaceAll("\\s+", " ");
+        assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", collapsed);
+    }
+
+    /**
+     * Extracts text and metadata from testPDFVarious.pdf and checks a broad set
+     * of snippets (footnotes, header/footer, tables, lists, keywords, subject,
+     * Japanese text).
+     */
+    @Test
+    public void testVarious() throws Exception {
+        Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
+        Metadata meta = new Metadata();
+
+        String text;
+        try (InputStream in = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFVarious.pdf")) {
+            text = getText(in, autoDetect, meta);
+        }
+        // Whitespace-collapsed copy, used for the table assertions below
+        final String collapsed = text.replaceAll("\\s+", " ");
+
+        String[] plainSnippets = {
+                "Footnote appears here",
+                "This is a footnote.",
+                "This is the header text.",
+                "This is the footer text.",
+                "Here is a text box",
+                "Bold",
+                "italic",
+                "underline",
+                "superscript",
+                "subscript",
+                "Here is a citation:",
+                "Figure 1 This is a caption for Figure 1",
+                "(Kramer)"
+        };
+        for (String snippet : plainSnippets) {
+            assertContains(snippet, text);
+        }
+
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", collapsed);
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", collapsed);
+        assertContains("This is a hyperlink", text);
+
+        assertContains("Here is a list:", text);
+        // Only the item text is asserted, not the bullet glyph or its separator
+        for (int item = 1; item <= 3; item++) {
+            assertContains("Bullet " + item, text);
+        }
+
+        assertContains("Here is a numbered list:", text);
+        // A space (not a tab) is expected between the number and the item text
+        for (int item = 1; item <= 3; item++) {
+            assertContains(item + ") Number bullet " + item, text);
+        }
+
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
+                assertContains("Row " + row + " Col " + col, text);
+            }
+        }
+
+        assertContains("Keyword1 Keyword2", text);
+        assertEquals("Keyword1 Keyword2", meta.get(Metadata.KEYWORDS));
+
+        assertContains("Subject is here", text);
+        assertEquals("Subject is here", meta.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Subject is here", meta.get(Metadata.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", text);
+        // Special version of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", text);
+        // further Japanese characters
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", text);
+
+        assertContains("And then some Gothic text:", text);
+        // TODO: I saved the word doc as a PDF, but that
+        // process somehow, apparently lost the gothic
+        // chars, so we cannot test this here:
+        //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", text);
+    }
+
+    /**
+     * Checks that annotation text from testAnnotations.pdf is extracted by
+     * default, is omitted when disabled on the parser's own config or through a
+     * PDFParserConfig in the ParseContext, and that the XML output has as many
+     * closing as opening paragraph tags.
+     */
+    @Test
+    public void testAnnotations() throws Exception {
+        final String pdfPath = "/test-documents/testAnnotations.pdf";
+        Parser autoDetect = new AutoDetectParser(); // Should auto-detect!
+
+        // Defaults: both body text and the comment are present
+        String text;
+        try (InputStream in = getResourceAsStream(pdfPath)) {
+            text = getText(in, autoDetect);
+        }
+        text = text.replaceAll("[\\s\u00a0]+", " ");
+        assertContains("Here is some text", text);
+        assertContains("Here is a comment", text);
+
+        // Test w/ annotation text disabled on the parser itself:
+        PDFParser pdfParser = new PDFParser();
+        pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
+        try (InputStream in = getResourceAsStream(pdfPath)) {
+            text = getText(in, pdfParser);
+        }
+        text = text.replaceAll("[\\s\u00a0]+", " ");
+        assertContains("Here is some text", text);
+        assertEquals(-1, text.indexOf("Here is a comment"));
+
+        // annotation text disabled through parsecontext
+        PDFParserConfig noAnnotations = new PDFParserConfig();
+        noAnnotations.setExtractAnnotationText(false);
+        ParseContext ctx = new ParseContext();
+        ctx.set(PDFParserConfig.class, noAnnotations);
+        try (InputStream in = getResourceAsStream(pdfPath)) {
+            text = getText(in, autoDetect, ctx);
+        }
+        text = text.replaceAll("[\\s\u00a0]+", " ");
+        assertContains("Here is some text", text);
+        assertEquals(-1, text.indexOf("Here is a comment"));
+
+        // TIKA-738: make sure no extra </p> tags
+        String xhtml = getXML("testAnnotations.pdf").xml;
+        int opening = substringCount("<p>", xhtml);
+        int closing = substringCount("</p>", xhtml);
+        assertEquals(opening, closing);
+    }
+
+    /**
+     * TIKA-981: the XML for testPopupAnnotation.pdf must contain the note
+     * text and the string "igalsh".
+     */
+    @Test
+    public void testPopupAnnotation() throws Exception {
+        String xhtml = getXML("testPopupAnnotation.pdf").xml;
+        assertContains("this is the note", xhtml);
+        assertContains("igalsh", xhtml);
+    }
+
+    /**
+     * The XML for testPDFPackage.pdf must contain both "PDF1" and "PDF2".
+     */
+    @Test
+    public void testEmbeddedPDFs() throws Exception {
+        XMLResult parsed = getXML("testPDFPackage.pdf");
+        for (String marker : new String[]{"PDF1", "PDF2"}) {
+            assertContains(marker, parsed.xml);
+        }
+    }
+
+    /**
+     * With all whitespace stripped from the XML of testPageNumber.pdf,
+     * a paragraph consisting solely of "1" must be present.
+     */
+    @Test
+    public void testPageNumber() throws Exception {
+        final String rawXml = getXML("testPageNumber.pdf").xml;
+        final String withoutWhitespace = rawXml.replaceAll("\\s+", "");
+        assertContains("<p>1</p>", withoutWhitespace);
+    }
+
+    /**
+     * Test to ensure that links are extracted from the text.
+     * <p/>
+     * Note - the PDF contains the text "This is a hyperlink", which has
+     * a hyperlink annotation on it linking to the Tika site. This
+     * test will need updating when we're able to apply the annotation
+     * to the text itself, rather than following on afterwards as now.
+     */
+    @Test
+    public void testLinks() throws Exception {
+        final XMLResult result = getXML("testPDFVarious.pdf");
+        // The anchor is expected as an empty element inside an "annotation" div;
+        // nothing but whitespace may sit between the href value and "/>".
+        assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml);
+    }
+
+    /**
+     * Checks the effect of the enableAutoSpace setting on testExtraSpaces.pdf,
+     * first through the PDFParser's own config and then through a
+     * PDFParserConfig placed in the ParseContext.
+     */
+    @Test
+    public void testDisableAutoSpace() throws Exception {
+        final String phrase = "Here is some formatted text";
+
+        PDFParser parser = new PDFParser();
+        parser.getPDFParserConfig().setEnableAutoSpace(false);
+        String content = getXML("testExtraSpaces.pdf", parser).xml.replaceAll("[\\s\u00a0]+", " ");
+        // Text is correct when autoSpace is off:
+        assertContains(phrase, content);
+
+        parser.getPDFParserConfig().setEnableAutoSpace(true);
+        content = getXML("testExtraSpaces.pdf", parser).xml.replaceAll("[\\s\u00a0]+", " ");
+        // Text has extra spaces when autoSpace is on
+        assertEquals(-1, content.indexOf(phrase));
+
+        //now try with autodetect
+        Parser autoParser = new AutoDetectParser();
+        PDFParserConfig config = new PDFParserConfig();
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+        //default is true
+        content = getXML("testExtraSpaces.pdf", autoParser, context).xml.replaceAll("[\\s\u00a0]+", " ");
+        // Text has extra spaces when autoSpace is on
+        assertEquals(-1, content.indexOf(phrase));
+
+        config.setEnableAutoSpace(false);
+        // NOTE(review): this call passes the PDFParser instance rather than
+        // autoParser - confirm that is intended.
+        content = getXML("testExtraSpaces.pdf", parser, context).xml.replaceAll("[\\s\u00a0]+", " ");
+
+        // Text is correct when autoSpace is off:
+        assertContains(phrase, content);
+
+    }
+
+    /**
+     * Checks the suppressDuplicateOverlappingText setting on
+     * testOverlappingText.pdf, first via the PDFParser's own config and then
+     * via a PDFParserConfig in the ParseContext used with AutoDetectParser.
+     */
+    @Test
+    public void testDuplicateOverlappingText() throws Exception {
+        final String keptText = "Text the first timeText the second time";
+        final String dedupedText = "Text the first timesecond time";
+
+        PDFParser parser = new PDFParser();
+        // Default is false (keep overlapping text):
+        XMLResult parsed = getXML("testOverlappingText.pdf", parser);
+        assertContains(keptText, parsed.xml);
+
+        parser.getPDFParserConfig().setSuppressDuplicateOverlappingText(true);
+        parsed = getXML("testOverlappingText.pdf", parser);
+        // "Text the first" was dedup'd:
+        assertContains(dedupedText, parsed.xml);
+
+        //now try with autodetect
+        Parser autoParser = new AutoDetectParser();
+        PDFParserConfig config = new PDFParserConfig();
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+        parsed = getXML("testOverlappingText.pdf", autoParser, context);
+        // Default is false (keep overlapping text):
+        assertContains(keptText, parsed.xml);
+
+        config.setSuppressDuplicateOverlappingText(true);
+        parsed = getXML("testOverlappingText.pdf", autoParser, context);
+        // "Text the first" was dedup'd:
+        assertContains(dedupedText, parsed.xml);
+
+    }
+
+    /**
+     * Checks the sortByPosition setting on testPDFTwoTextBoxes.pdf: unsorted
+     * output keeps each column together, sorted output interleaves the columns.
+     * Every resource stream is opened in try-with-resources so it is closed
+     * even when extraction or an assertion fails.
+     */
+    @Test
+    public void testSortByPosition() throws Exception {
+        final String pdfPath = "/test-documents/testPDFTwoTextBoxes.pdf";
+        final String columnByColumn =
+                "Left column line 1 Left column line 2 Right column line 1 Right column line 2";
+        // The split "colu mn" is part of the expected extractor output.
+        final String interleaved =
+                "Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2";
+
+        PDFParser parser = new PDFParser();
+        parser.getPDFParserConfig().setEnableAutoSpace(false);
+        String content;
+        // Default is false (do not sort):
+        try (InputStream stream = getResourceAsStream(pdfPath)) {
+            content = getText(stream, parser);
+        }
+        content = content.replaceAll("\\s+", " ");
+        assertContains(columnByColumn, content);
+
+        parser.getPDFParserConfig().setSortByPosition(true);
+        try (InputStream stream = getResourceAsStream(pdfPath)) {
+            content = getText(stream, parser);
+        }
+        content = content.replaceAll("\\s+", " ");
+        // Column text is now interleaved:
+        assertContains(interleaved, content);
+
+        //now try setting autodetect via parsecontext
+        AutoDetectParser autoParser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        context.set(PDFParserConfig.class, config);
+        // Default is false (do not sort):
+        try (InputStream stream = getResourceAsStream(pdfPath)) {
+            content = getText(stream, autoParser, context);
+        }
+        content = content.replaceAll("\\s+", " ");
+        assertContains(columnByColumn, content);
+
+        config.setSortByPosition(true);
+        context.set(PDFParserConfig.class, config);
+        // NOTE(review): this call uses the PDFParser instance and does not pass
+        // the context, so the config set just above is not exercised here -
+        // confirm whether autoParser/context was intended.
+        try (InputStream stream = getResourceAsStream(pdfPath)) {
+            content = getText(stream, parser);
+        }
+        content = content.replaceAll("\\s+", " ");
+        // Column text is now interleaved:
+        assertContains(interleaved, content);
+
+    }
+
+    /**
+     * TIKA-1035: the bookmark text of testPDF_bookmarks.pdf must appear in the
+     * XML, and it must appear before the closing body tag.
+     */
+    @Test
+    public void testBookmarks() throws Exception {
+        String xhtml = getXML("testPDF_bookmarks.pdf").xml;
+        int bookmarkAt = xhtml.indexOf("Denmark bookmark is here");
+        int bodyEndAt = xhtml.indexOf("</body>");
+        assertTrue(bookmarkAt != -1);
+        assertTrue(bodyEndAt != -1);
+        assertTrue(bookmarkAt < bodyEndAt);
+    }
+
+    /**
+     * TIKA-1124: a docx that embeds a pdf which in turn embeds a docx.
+     * Checks the order of the three marker strings in the XML output and the
+     * file names / media types reported while extracting the container.
+     */
+    @Test
+    public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
+       /* format of test doc:
+         docx/
+            pdf/
+               docx
+       */
+
+        String xhtml = getXML("testPDFEmbeddingAndEmbedded.docx").xml;
+        int outerAt = xhtml.indexOf("Outer_haystack");
+        int pdfAt = xhtml.indexOf("pdf_haystack");
+        int needleAt = xhtml.indexOf("Needle");
+        assertTrue(outerAt > -1);
+        assertTrue(pdfAt > -1);
+        assertTrue(needleAt > -1);
+        // outer text first, then the pdf's text, then the innermost needle
+        assertTrue(needleAt > pdfAt && pdfAt > outerAt);
+
+        TrackingHandler tracker = new TrackingHandler();
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        try (TikaInputStream tis = TikaInputStream.get(
+                getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"))) {
+            // the extractor is also handed in as the second argument
+            extractor.extract(tis, extractor, tracker);
+        }
+
+        assertEquals(3, tracker.filenames.size());
+        assertEquals(3, tracker.mediaTypes.size());
+
+        assertEquals("image1.emf", tracker.filenames.get(0));
+        assertNull(tracker.filenames.get(1));
+        assertEquals("Test.docx", tracker.filenames.get(2));
+
+        assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
+        assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
+        assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
+    }
+
+
+    // TIKA-973
+    //commented out until test documents that are unambiguously
+    //consistent with Apache License v2.0 are contributed.
+    //TODO: add back test for AcroForm extraction; test document should include
+    //recursive forms
+/*    public void testAcroForm() throws Exception{
+       Parser p = new AutoDetectParser();
+       ParseContext context = new ParseContext();
+       InputStream stream = 
getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
+       String txt = getText(stream, p, context);
+       stream.close();
+
+       //simple first level form contents
+       assertContains("to: John Doe", txt);
+       //checkbox
+       assertContains("xpackaging: Yes", txt);
+       
+       //this guarantees that the form processor
+       //worked recursively at least once...i.e. it didn't just
+       //take the first form
+       stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
+       txt = getText(stream, p, context);
+       stream.close();
+       assertContains("123 Main St.", txt);
+       
+       
+       //now test with nonsequential parser
+       PDFParserConfig config = new PDFParserConfig();
+       config.setUseNonSequentialParser(true);
+       context.set(PDFParserConfig.class, config);
+       stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
+       txt = getText(stream, p, context);
+       stream.close();
+       
+       //simple first level form contents
+       assertContains("to: John Doe", txt);
+       //checkbox
+       assertContains("xpackaging: Yes", txt);
+       
+       //this guarantees that the form processor
+       //worked recursively at least once...i.e. it didn't just
+       //take the first form
+       stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
+       txt = getText(stream, p, context);
+       assertContains("123 Main St.", txt);
+       stream.close();     
+    }
+*/
+
+    /**
+     * TIKA-1226: the current test doc does not contain any content in the
+     * signature area, so this only checks that parsing completes and that the
+     * text field shows up as a list item.
+     */
+    //TODO: find a better test file for this issue.
+    @Test
+    public void testSignatureInAcroForm() throws Exception {
+        String xhtml = getXML("testPDF_acroform3.pdf").xml;
+        boolean fieldPresent = xhtml.contains("<li>aTextField: TIKA-1226</li>");
+        assertTrue("found", fieldPresent);
+    }
+
+    /**
+     * TIKA-1228, TIKA-1268: attachments of testPDF_childAttachments.pdf must be
+     * reached both in the plain XML output and when parsing through a
+     * RecursiveParserWrapper with inline-image extraction switched on.
+     */
+    @Test
+    public void testEmbeddedFilesInChildren() throws Exception {
+        String xhtml = getXML("testPDF_childAttachments.pdf").xml;
+        //"regressiveness" exists only in Unit10.doc not in the container pdf document
+        assertTrue(xhtml.contains("regressiveness"));
+
+        BasicContentHandlerFactory ignoreContent =
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(), ignoreContent);
+
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImages(true);
+        config.setExtractUniqueInlineImagesOnly(false);
+
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+        // the wrapper itself is registered as the Parser in the context
+        context.set(Parser.class, wrapper);
+
+        try (TikaInputStream tis = TikaInputStream.get(
+                getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
+            wrapper.parse(tis, new BodyContentHandler(-1), new Metadata(), context);
+        }
+
+        List<Metadata> collected = wrapper.getMetadata();
+        assertEquals(5, collected.size());
+
+        // resource names (entry 0 has none; entry 2 is not checked)
+        assertNull(collected.get(0).get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("image0.jpg", collected.get(1).get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("Press Quality(1).joboptions", collected.get(3).get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("Unit10.doc", collected.get(4).get(Metadata.RESOURCE_NAME_KEY));
+
+        // content types of entries 1..4
+        assertEquals(MediaType.image("jpeg").toString(), collected.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals(MediaType.image("tiff").toString(), collected.get(2).get(Metadata.CONTENT_TYPE));
+        assertEquals("text/plain; charset=ISO-8859-1", collected.get(3).get(Metadata.CONTENT_TYPE));
+        assertEquals(TYPE_DOC.toString(), collected.get(4).get(Metadata.CONTENT_TYPE));
+    }
+
+
+    /**
+     * The XML for testPDFFileEmbInAnnotation.pdf must contain the text
+     * "This is a Excel".
+     */
+    @Test
+    public void testEmbeddedFilesInAnnotations() throws Exception {
+        XMLResult parsed = getXML("testPDFFileEmbInAnnotation.pdf");
+        boolean embeddedTextFound = parsed.xml.contains("This is a Excel");
+
+        assertTrue(embeddedTextFound);
+    }
+
+    /**
+     * TIKA-1341: parsing testPDFTripleLangTitle.pdf must report exactly one
+     * end-of-document event to the handler.
+     */
+    @Test
+    public void testSingleCloseDoc() throws Exception {
+        EventCountingHandler counter = new EventCountingHandler();
+        Parser autoDetect = new AutoDetectParser();
+        try (InputStream in = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFTripleLangTitle.pdf")) {
+            autoDetect.parse(in, counter, new Metadata(), new ParseContext());
+        }
+        assertEquals(1, counter.getEndDocument());
+    }
+
+    /**
+     * For each testPDF_Version.*.pdf file, checks the expected dc:format value,
+     * pdf:PDFVersion and (where one is listed) pdf:PDFExtensionVersion; then
+     * checks the full set of dc:format values and the pdfaid properties of the
+     * 11.x PDF/A-1b file.
+     */
+    @Test
+    public void testVersions() throws Exception {
+
+        // file-name infix -> dc:format value that must be among the reported ones
+        Map<String, String> dcFormat = new HashMap<String, String>();
+        dcFormat.put("4.x", "application/pdf; version=1.3");
+        dcFormat.put("5.x", "application/pdf; version=1.4");
+        dcFormat.put("6.x", "application/pdf; version=1.5");
+        dcFormat.put("7.x", "application/pdf; version=1.6");
+        dcFormat.put("8.x", "application/pdf; version=1.7");
+        dcFormat.put("9.x", "application/pdf; version=1.7");
+        dcFormat.put("10.x", "application/pdf; version=1.7");
+        dcFormat.put("11.x.PDFA-1b", "application/pdf; version=1.7");
+
+        // file-name infix -> expected pdf:PDFVersion
+        Map<String, String> pdfVersions = new HashMap<String, String>();
+        pdfVersions.put("4.x", "1.3");
+        pdfVersions.put("5.x", "1.4");
+        pdfVersions.put("6.x", "1.5");
+        pdfVersions.put("7.x", "1.6");
+        pdfVersions.put("8.x", "1.7");
+        pdfVersions.put("9.x", "1.7");
+        pdfVersions.put("10.x", "1.7");
+        pdfVersions.put("11.x.PDFA-1b", "1.7");
+
+        // file-name infix -> expected pdf:PDFExtensionVersion (only some files)
+        Map<String, String> pdfExtensionVersions = new HashMap<String, String>();
+        pdfExtensionVersions.put("9.x", "1.7 Adobe Extension Level 3");
+        pdfExtensionVersions.put("10.x", "1.7 Adobe Extension Level 8");
+        pdfExtensionVersions.put("11.x.PDFA-1b", "1.7 Adobe Extension Level 8");
+
+        for (Map.Entry<String, String> e : dcFormat.entrySet()) {
+            String fName = "testPDF_Version." + e.getKey() + ".pdf";
+
+            XMLResult r = getXML(fName);
+            boolean foundDC = false;
+            String[] vals = r.metadata.getValues("dc:format");
+            for (String v : vals) {
+                if (v.equals(e.getValue())) {
+                    foundDC = true;
+                }
+            }
+            assertTrue("dc:format ::" + e.getValue(), foundDC);
+            String extensionVersionTruth = pdfExtensionVersions.get(e.getKey());
+            if (extensionVersionTruth != null) {
+                assertEquals("pdf:PDFExtensionVersion :: " + extensionVersionTruth,
+                        extensionVersionTruth,
+                        r.metadata.get("pdf:PDFExtensionVersion"));
+            }
+            assertEquals("pdf:PDFVersion", pdfVersions.get(e.getKey()),
+                    r.metadata.get("pdf:PDFVersion"));
+        }
+        //now test full 11.x
+        XMLResult r = getXML("testPDF_Version.11.x.PDFA-1b.pdf");
+        Set<String> versions = new HashSet<String>();
+        for (String fmt : r.metadata.getValues("dc:format")) {
+            versions.add(fmt);
+        }
+
+        for (String hit : new String[]{"application/pdf; version=1.7",
+                "application/pdf; version=\"A-1b\"",
+                "application/pdf; version=\"1.7 Adobe Extension Level 8\""
+        }) {
+            assertTrue(hit, versions.contains(hit));
+        }
+
+        // JUnit argument order is (message, expected, actual); keeping the literal
+        // in the "expected" slot makes a failure message read the right way round.
+        assertEquals("pdfaid:conformance", "B", r.metadata.get("pdfaid:conformance"));
+        assertEquals("pdfaid:part", "1", r.metadata.get("pdfaid:part"));
+    }
+
+    /**
+     * Checks that each author-related metadata key of the two-author test
+     * document carries exactly two values, and that both sample authors are
+     * among them.
+     */
+    @Test
+    public void testMultipleAuthors() throws Exception {
+        XMLResult result = getXML("testPDF_twoAuthors.pdf");
+
+        for (String key : new String[]{"dc:creator", "meta:author", "creator", "Author"}) {
+            String[] authors = result.metadata.getValues(key);
+            assertEquals("number of authors == 2 for key: " + key, 2, authors.length);
+            //the order of the values is not asserted, so compare as a set
+            Set<String> found = new HashSet<String>();
+            for (String author : authors) {
+                found.add(author);
+            }
+            assertTrue("Sample Author 1", found.contains("Sample Author 1"));
+            assertTrue("Sample Author 2", found.contains("Sample Author 2"));
+        }
+    }
+
+    //STUB test, to be fleshed out once TIKA-1295 is fixed
+    @Test
+    public void testMultipleTitles() throws Exception {
+        //TODO: add other tests as part of TIKA-1295:
+        //dc:title-fr-ca (or whatever key is decided on) should be "Bonjour World";
+        //dc:title-zh-ch is currently broken...possibly a bug in PDFBox while injecting xmp?
+        Metadata metadata = getXML("testPDFTripleLangTitle.pdf").metadata;
+        assertEquals("Hello World", metadata.get("dc:title"));
+    }
+
+    /**
+     * With inline-image extraction switched on, the child-attachments document
+     * must yield two inline and two attachment resources.  Once an
+     * {@link AvoidInlineSelector} is added to the context, only the two
+     * attachments may remain.
+     */
+    @Test
+    public void testInlineSelector() throws Exception {
+        final String inlineType =
+                TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
+        final String attachmentType =
+                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString();
+
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+        pdfConfig.setExtractUniqueInlineImagesOnly(false);
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(org.apache.tika.parser.pdf.PDFParserConfig.class, pdfConfig);
+        parseContext.set(org.apache.tika.parser.Parser.class, new AutoDetectParser());
+
+        int inlineCount = 0;
+        int attachmentCount = 0;
+        for (Metadata metadata
+                : getRecursiveMetadata("testPDF_childAttachments.pdf", parseContext)) {
+            //a missing resource type matches neither branch
+            String type = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            if (inlineType.equals(type)) {
+                inlineCount++;
+            } else if (attachmentType.equals(type)) {
+                attachmentCount++;
+            }
+        }
+        assertEquals(2, inlineCount);
+        assertEquals(2, attachmentCount);
+
+        //now filter out the inline resources with a selector
+        parseContext.set(org.apache.tika.extractor.DocumentSelector.class,
+                new AvoidInlineSelector());
+        inlineCount = 0;
+        attachmentCount = 0;
+        for (Metadata metadata
+                : getRecursiveMetadata("testPDF_childAttachments.pdf", parseContext)) {
+            String type = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            if (inlineType.equals(type)) {
+                inlineCount++;
+            } else if (attachmentType.equals(type)) {
+                attachmentCount++;
+            }
+        }
+        assertEquals(0, inlineCount);
+        assertEquals(2, attachmentCount);
+    }
+
+    /**
+     * Without any configuration the child-attachments document must yield no
+     * inline resources and two attachments.  With a {@link PDFParserConfig} that
+     * enables inline-image extraction, two inline resources must show up in
+     * addition to the two attachments.
+     */
+    @Test
+    public void testInlineConfig() throws Exception {
+
+        List<Metadata> metadatas = getRecursiveMetadata("testPDF_childAttachments.pdf");
+        int inline = 0;
+        int attach = 0;
+        for (Metadata m : metadatas) {
+            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            if (v != null) {
+                if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
+                    inline++;
+                } else if (v.equals(
+                        TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) {
+                    attach++;
+                }
+            }
+        }
+        assertEquals(0, inline);
+        assertEquals(2, attach);
+
+        //now try turning on inline image extraction via the config
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImages(true);
+        config.setExtractUniqueInlineImagesOnly(false);
+
+        ParseContext context = new ParseContext();
+        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
+        context.set(org.apache.tika.parser.Parser.class, new AutoDetectParser());
+        inline = 0;
+        attach = 0;
+
+        metadatas = getRecursiveMetadata("testPDF_childAttachments.pdf", context);
+        for (Metadata m : metadatas) {
+            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            if (v != null) {
+                if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) {
+                    inline++;
+                } else if (v.equals(
+                        TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) {
+                    attach++;
+                }
+            }
+        }
+        assertEquals(2, inline);
+        assertEquals(2, attach);
+    }
+
+    @Test //TIKA-1376
+    public void testEmbeddedFileNameExtraction() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF_multiFormatEmbFiles.pdf");
+        assertEquals("metadata size", 5, metadataList.size());
+        //index 1 is treated as the first attachment; index 0 is presumably the
+        //container PDF itself (not asserted here)
+        String firstAttachmentName = metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY);
+        assertEquals("attachment file name", "Test.txt", firstAttachmentName);
+    }
+
+    @Test //TIKA-1374
+    public void testOSSpecificEmbeddedFileExtraction() throws Exception {
+        List<Metadata> metadatas = getRecursiveMetadata("testPDF_multiFormatEmbFiles.pdf");
+        assertEquals("metadata size", 5, metadatas.size());
+
+        //parallel arrays: expected resource name and a snippet of its content,
+        //for the metadata entries at indices 1..4
+        String[] expectedNames = {"Test.txt", "TestMac.txt", "TestDos.txt", "TestUnix.txt"};
+        String[] expectedSnippets = {"os specific", "mac embedded", "dos embedded",
+                "unix embedded"};
+        for (int i = 0; i < expectedNames.length; i++) {
+            Metadata embedded = metadatas.get(i + 1);
+            assertEquals("file name", expectedNames[i],
+                    embedded.get(Metadata.RESOURCE_NAME_KEY));
+            assertContains(expectedSnippets[i],
+                    embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
+        }
+    }
+
+    @Test //TIKA-1427
+    public void testEmbeddedFileMarkup() throws Exception {
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+        pdfConfig.setExtractUniqueInlineImagesOnly(false);
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(org.apache.tika.parser.Parser.class, new AutoDetectParser());
+        parseContext.set(org.apache.tika.parser.pdf.PDFParserConfig.class, pdfConfig);
+
+        String childAttachmentsXml = getXML("testPDF_childAttachments.pdf", parseContext).xml;
+        //a regular attachment is rendered as an "embedded" div
+        assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", childAttachmentsXml);
+        //an inline image is rendered as an img element
+        assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />",
+                childAttachmentsXml);
+
+        //a document embedded inside an annotation (parsed without the context)
+        String annotationXml = getXML("testPDFFileEmbInAnnotation.pdf").xml;
+        assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", annotationXml);
+    }
+
+    //Access checker tests
+
+    /**
+     * Checks that, when no access checker is configured, text is extracted from
+     * the extraction-restricted test documents: first without a password, then
+     * with a {@link PasswordProvider} that supplies "user".
+     */
+    @Test
+    public void testLegacyAccessChecking() throws Exception {
+        //test that default behavior doesn't throw AccessPermissionException
+        for (String file : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+            String xml = getXML(file).xml;
+            assertContains("Hello World", xml);
+        }
+
+        //now try with the user password
+        PasswordProvider provider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "user";
+            }
+        };
+
+        ParseContext context = new ParseContext();
+        context.set(PasswordProvider.class, provider);
+
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+        }) {
+            assertContains("Hello World", getXML(path, context).xml);
+        }
+    }
+
+    /**
+     * Exercises the access checker against the "owner_empty" test documents when
+     * no password provider is configured.
+     */
+    @Test
+    public void testAccessCheckingEmptyPassword() throws Exception {
+        final String noAccessibilityDoc =
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf";
+        final String yesAccessibilityDoc =
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf";
+
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        //don't allow extraction, not even for accessibility
+        pdfConfig.setAccessChecker(new AccessChecker(false));
+        Parser autoDetect = new AutoDetectParser();
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
+        //with that checker both documents must be refused
+        for (String doc : new String[]{noAccessibilityDoc, yesAccessibilityDoc}) {
+            assertException("/test-documents/" + doc, autoDetect, parseContext,
+                    AccessPermissionException.class);
+        }
+
+        //with AccessChecker(true), only the document without the accessibility
+        //permission is still refused
+        pdfConfig.setAccessChecker(new AccessChecker(true));
+        assertException("/test-documents/" + noAccessibilityDoc,
+                autoDetect, parseContext, AccessPermissionException.class);
+
+        String xml = getXML(yesAccessibilityDoc, parseContext).xml;
+        assertContains("Hello World", xml);
+    }
+
+    /**
+     * Exercises the access checker when the password provider supplies "user":
+     * the "owner_empty" documents must be rejected as encrypted, while the
+     * outcome for the "owner_user" documents depends on the access checker.
+     */
+    @Test
+    public void testAccessCheckingUserPassword() throws Exception {
+        final String resourceDir = "/test-documents/";
+        final String[] emptyPasswordDocs = {
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        };
+        final String[] userPasswordDocs = {
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+        };
+
+        ParseContext parseContext = new ParseContext();
+
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        //don't allow extraction, not even for accessibility
+        pdfConfig.setAccessChecker(new AccessChecker(false));
+        PasswordProvider userPassword = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "user";
+            }
+        };
+
+        parseContext.set(PasswordProvider.class, userPassword);
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
+        Parser autoDetect = new AutoDetectParser();
+
+        //test bad passwords
+        for (String doc : emptyPasswordDocs) {
+            assertException(resourceDir + doc, autoDetect, parseContext,
+                    EncryptedDocumentException.class);
+        }
+
+        //bad password is still a bad password
+        pdfConfig.setAccessChecker(new AccessChecker(true));
+        for (String doc : emptyPasswordDocs) {
+            assertException(resourceDir + doc, autoDetect, parseContext,
+                    EncryptedDocumentException.class);
+        }
+
+        //now test documents that require this "user" password
+        assertException(resourceDir + userPasswordDocs[0],
+                autoDetect, parseContext, AccessPermissionException.class);
+
+        assertContains("Hello World", getXML(userPasswordDocs[1], parseContext).xml);
+
+        //back to the checker that refuses everything
+        pdfConfig.setAccessChecker(new AccessChecker(false));
+        for (String doc : userPasswordDocs) {
+            assertException(resourceDir + doc, autoDetect, parseContext,
+                    AccessPermissionException.class);
+        }
+    }
+
+    /**
+     * Checks that, when the password provider supplies "owner", text is extracted
+     * from all four extraction-restricted test documents under both
+     * {@code AccessChecker(true)} and {@code AccessChecker(false)}.
+     */
+    @Test
+    public void testAccessCheckingOwnerPassword() throws Exception {
+        ParseContext context = new ParseContext();
+
+        PDFParserConfig config = new PDFParserConfig();
+        //first pass uses AccessChecker(true); AccessChecker(false) is tried below
+        config.setAccessChecker(new AccessChecker(true));
+        PasswordProvider passwordProvider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "owner";
+            }
+        };
+
+        context.set(PasswordProvider.class, passwordProvider);
+        context.set(PDFParserConfig.class, config);
+
+        //with owner's password, text can be extracted, no matter the
+        //AccessChecker's settings
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+
+            assertContains("Hello World", getXML(path, context).xml);
+        }
+
+        //really, with owner's password, all extraction is allowed
+        config.setAccessChecker(new AccessChecker(false));
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+            assertContains("Hello World", getXML(path, context).xml);
+        }
+    }
+
+    @Test
+    public void testPDFEncodedStringsInXMP() throws Exception {
+        //TIKA-1678: the title of this document must come out as plain "Microsoft"
+        Metadata metadata = getXML("testPDF_PDFEncodedStringInXMP.pdf").metadata;
+        String title = metadata.get(TikaCoreProperties.TITLE);
+        assertEquals("Microsoft", title);
+    }
+
+    @Test
+    public void testXFAExtractionBasic() throws Exception {
+        String xml = getXML("testPDF_XFA_govdocs1_258578.pdf").xml;
+        //content that exists only in the "regular" pdf must be there...
+        assertContains("Mount Rushmore National Memorial", xml);
+        //...and so must the xfa fields and their data
+        assertContains("<li fieldName=\"School_Name\">School Name: my_school</li>", xml);
+    }
+
+    /**
+     * With {@code setIfXFAExtractOnlyXFA(true)} the output must contain the XFA
+     * content and must not contain the text of the regular PDF.
+     */
+    @Test
+    public void testXFAOnly() throws Exception {
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setIfXFAExtractOnlyXFA(true);
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
+        XMLResult result = getXML("testPDF_XFA_govdocs1_258578.pdf", parseContext);
+        String xml = result.xml;
+        assertContains("<li fieldName=\"Room_1\">Room [1]: my_room1</li>", xml);
+        assertContains("</xfa_content></body></html>", xml);
+
+        assertNotContained("Mount Rushmore National Memorial", xml);
+    }
+
+    /**
+     * Checks the XMP Media Management document id of two test files and, for the
+     * PDF/A-1b file, the four parallel history arrays (seven entries each).
+     */
+    @Test
+    public void testXMPMM() throws Exception {
+        Metadata twoAuthors = getXML("testPDF_twoAuthors.pdf").metadata;
+        assertEquals("uuid:0e46913c-72b9-40c0-8232-69e362abcd1e",
+                twoAuthors.get(XMPMM.DOCUMENTID));
+
+        Metadata pdfa = getXML("testPDF_Version.11.x.PDFA-1b.pdf").metadata;
+        assertEquals("uuid:cccee1fc-51b3-4b52-ac86-672af3974d25",
+                pdfa.get(XMPMM.DOCUMENTID));
+
+        //now test for 7 elements in each parallel array
+        //from the history section
+        String[] expectedInstanceIds = {
+                "uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf",
+                "uuid:edc4279e-0d5f-465e-b13e-1298402fd11c",
+                "uuid:f565b775-43f3-4a9a-8541-e98c4115db6d",
+                "uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f",
+                "uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa",
+                "uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36",
+                "uuid:c1669773-a6ca-4bdd-aade-519030d0af00"
+        };
+        assertArrayEquals(expectedInstanceIds,
+                pdfa.getValues(XMPMM.HISTORY_EVENT_INSTANCEID));
+
+        String[] expectedActions = {
+                "converted", "converted", "converted", "converted",
+                "converted", "converted", "converted"
+        };
+        assertArrayEquals(expectedActions, pdfa.getValues(XMPMM.HISTORY_ACTION));
+
+        String[] expectedAgents = {
+                "Preflight", "Preflight", "Preflight", "Preflight",
+                "Preflight", "Preflight", "Preflight"
+        };
+        assertArrayEquals(expectedAgents, pdfa.getValues(XMPMM.HISTORY_SOFTWARE_AGENT));
+
+        String[] expectedTimestamps = {
+                "2014-03-04T23:50:41Z",
+                "2014-03-04T23:50:42Z",
+                "2014-03-04T23:51:34Z",
+                "2014-03-04T23:51:36Z",
+                "2014-03-04T23:51:37Z",
+                "2014-03-04T23:52:22Z",
+                "2014-03-04T23:54:48Z"
+        };
+        assertArrayEquals(expectedTimestamps, pdfa.getValues(XMPMM.HISTORY_WHEN));
+    }
+
+    /**
+     * Parses a document with a bad page twice.  Without a PDFParserConfig in the
+     * context the parse must still throw, but with one warning recorded and the
+     * content extracted; with intermediate IOException catching turned off it must
+     * throw with no warning recorded and without that content.
+     */
+    @Test
+    public void testSkipBadPage() throws Exception {
+        //the test file comes from govdocs1;
+        //the TikaTest shortcuts can't be used because of the exception
+        final String badPagePdf = "/test-documents/testPDF_bad_page_303226.pdf";
+        Parser autoDetect = new AutoDetectParser();
+        ParseContext parseContext = new ParseContext();
+
+        ContentHandler bodyHandler = new BodyContentHandler(-1);
+        Metadata metadata = new Metadata();
+        boolean threwTikaException = false;
+        try (InputStream stream = getResourceAsStream(badPagePdf)) {
+            autoDetect.parse(stream, bodyHandler, metadata, parseContext);
+        } catch (TikaException e) {
+            threwTikaException = true;
+        }
+        String extracted = bodyHandler.toString();
+        assertTrue("Should have thrown exception", threwTikaException);
+        assertEquals(1,
+                metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
+        assertContains("Unknown dir",
+                metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
+        assertContains("1309.61", extracted);
+
+        //now try throwing exception immediately
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setCatchIntermediateIOExceptions(false);
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
+        bodyHandler = new BodyContentHandler(-1);
+        metadata = new Metadata();
+        threwTikaException = false;
+        try (InputStream stream = getResourceAsStream(badPagePdf)) {
+            autoDetect.parse(stream, bodyHandler, metadata, parseContext);
+        } catch (TikaException e) {
+            threwTikaException = true;
+        }
+        extracted = bodyHandler.toString();
+        assertTrue("Should have thrown exception", threwTikaException);
+        assertEquals(0,
+                metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
+        assertNotContained("1309.61", extracted);
+    }
+
+    /**
+     * For every OCR strategy, parses a docx that embeds a PDF and checks the
+     * extracted text, the presence or absence of the OCR div, and the number of
+     * metadata objects produced by the recursive parse.  Returns immediately
+     * when {@code canRunOCR()} is false.
+     */
+    @Test
+    public void testEmbeddedDocsWithOCR() throws Exception {
+        if (!canRunOCR()) {
+            return;
+        }
+        final String docx = "testPDFEmbeddingAndEmbedded.docx";
+        final String ocrMarkup = "<div class=\"ocr\">pdf_haystack";
+
+        for (PDFParserConfig.OCR_STRATEGY ocrStrategy : PDFParserConfig.OCR_STRATEGY.values()) {
+            PDFParserConfig pdfConfig = new PDFParserConfig();
+            pdfConfig.setOCRStrategy(ocrStrategy);
+            ParseContext parseContext = new ParseContext();
+            parseContext.set(PDFParserConfig.class, pdfConfig);
+            parseContext.set(Parser.class, new AutoDetectParser());
+
+            //make sure everything works with regular xml _and_ with recursive
+            String xml = getXML(docx, parseContext).xml;
+            assertContains("pdf_haystack", xml);
+            assertContains("Haystack", xml);
+            assertContains("Needle", xml);
+            if (ocrStrategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) {
+                assertNotContained(ocrMarkup, xml);
+            } else {
+                assertContains(ocrMarkup, xml);
+            }
+            assertEquals(4, getRecursiveMetadata(docx, parseContext).size());
+        }
+    }
+
+    /**
+     * Parses the resource at {@code path} and asserts that parsing fails with an
+     * exception whose class is exactly {@code expected}.
+     *
+     * @param path     resource path of the test document, handed to getResourceAsStream
+     * @param parser   parser to run
+     * @param context  parse context handed to the parser
+     * @param expected exact class of the exception that must be thrown
+     */
+    private void assertException(String path, Parser parser, ParseContext context,
+                                 Class<?> expected) {
+        boolean noEx = false;
+        InputStream is = getResourceAsStream(path);
+        try {
+            //the extracted text is irrelevant; only the thrown exception matters
+            getText(is, parser, context);
+            noEx = true;
+        } catch (Exception e) {
+            assertEquals("Not the right exception: " + path, expected, e.getClass());
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+        //asserted outside the try so the AssertionError cannot be mistaken
+        //for the parse failure
+        assertFalse(path + " should have thrown exception", noEx);
+    }
+
+    /**
+     * Simple class to count end of document events.  If functionality is useful,
+     * move to org.apache.tika in src/test.
+     * <p>
+     * Declared static because it never uses the enclosing test instance.
+     */
+    private static class EventCountingHandler extends ContentHandlerDecorator {
+        private int endDocument = 0;
+
+        /** Counts the event; the call is not passed on to the superclass. */
+        @Override
+        public void endDocument() {
+            endDocument++;
+        }
+
+        /** @return how many times {@link #endDocument()} has been called */
+        public int getEndDocument() {
+            return endDocument;
+        }
+    }
+
+    /**
+     * Selector that rejects embedded resources whose type is INLINE and accepts
+     * everything else, including resources with no type recorded.
+     * Declared static because it never uses the enclosing test instance.
+     */
+    private static class AvoidInlineSelector implements DocumentSelector {
+
+        @Override
+        public boolean select(Metadata metadata) {
+            String v = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            //equals(null) is false, so a missing type is selected
+            return !TikaCoreProperties.EmbeddedResourceType.INLINE.toString().equals(v);
+        }
+    }
+}


http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml 
b/tika-parser-modules/tika-parser-pdf-module/pom.xml
deleted file mode 100644
index 568303c..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/pom.xml
+++ /dev/null
@@ -1,126 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more 
contributor 
-  license agreements. See the NOTICE file distributed with this work for 
additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-pdf-module</artifactId>
-  <name>Apache Tika parser pdf module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <properties>
-    <commons.logging.version>1.1.3</commons.logging.version>
-  </properties>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-multimedia-module</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-xmp-commons</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>${commons.io.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.pdfbox</groupId>
-      <artifactId>pdfbox</artifactId>
-      <version>${pdfbox.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.pdfbox</groupId>
-      <artifactId>pdfbox-tools</artifactId>
-      <version>${pdfbox.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.pdfbox</groupId>
-      <artifactId>jempbox</artifactId>
-      <version>${jempbox.version}</version>
-    </dependency>
-    <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
-         as optional, but we prefer to have them always to avoid
-         problems with encrypted PDFs. -->
-    <dependency>
-      <groupId>org.bouncycastle</groupId>
-      <artifactId>bcmail-jdk15on</artifactId>
-      <version>${bouncycastle.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.bouncycastle</groupId>
-      <artifactId>bcprov-jdk15on</artifactId>
-      <version>${bouncycastle.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-logging</groupId>
-      <artifactId>commons-logging</artifactId>
-      <version>${commons.logging.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-package-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-text-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-office-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <!-- Copied from PDFBox:
-       For legal reasons (incompatible license), jai-imageio-core is to be used
-       only in the tests and may not be distributed. See also LEGAL-195 -->
-    <dependency>
-      <groupId>com.github.jai-imageio</groupId>
-      <artifactId>jai-imageio-core</artifactId>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
-</project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
 
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
deleted file mode 100644
index d38a96d..0000000
--- 
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/module/pdf/internal/Activator.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.pdf.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, 
Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
 
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
deleted file mode 100644
index 832b06e..0000000
--- 
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ /dev/null
@@ -1,579 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pdf;
-
-import java.awt.image.BufferedImage;
-import java.io.BufferedInputStream;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Calendar;
-import java.util.List;
-import java.util.ListIterator;
-import java.util.Locale;
-import java.util.Map;
-import java.util.TreeMap;
-
-import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
-
-import javax.xml.stream.XMLStreamException;
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.commons.io.IOUtils;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
-import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
-import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
-import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
-import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
-import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
-import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
-import org.apache.pdfbox.pdmodel.interactive.form.PDField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
-import org.apache.pdfbox.rendering.PDFRenderer;
-import org.apache.pdfbox.text.PDFTextStripper;
-import org.apache.pdfbox.tools.imageio.ImageIOUtil;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.ocr.TesseractOCRConfig;
-import org.apache.tika.parser.ocr.TesseractOCRParser;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
-
-class AbstractPDF2XHTML extends PDFTextStripper {
-
-    /**
-     * Maximum recursive depth during AcroForm processing.
-     * Prevents theoretical AcroForm recursion bomb.
-     */
-    private final static int MAX_ACROFORM_RECURSIONS = 10;
-
-    private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
-
-    /**
-     * Format used for signature dates
-     * TODO Make this thread-safe
-     */
-    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
-
-
-    final List<IOException> exceptions = new ArrayList<>();
-    final PDDocument pdDocument;
-    final XHTMLContentHandler xhtml;
-    private final ParseContext context;
-    private final Metadata metadata;
-    final PDFParserConfig config;
-
-    private int pageIndex = 0;
-
-    AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
-                      PDFParserConfig config) throws IOException {
-        this.pdDocument = pdDocument;
-        this.xhtml = new XHTMLContentHandler(handler, metadata);
-        this.context = context;
-        this.metadata = metadata;
-        this.config = config;
-    }
-
-    @Override
-    protected void startPage(PDPage page) throws IOException {
-        try {
-            xhtml.startElement("div", "class", "page");
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to start a page", e);
-        }
-        writeParagraphStart();
-    }
-
-    EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
-        EmbeddedDocumentExtractor extractor =
-                context.get(EmbeddedDocumentExtractor.class);
-        if (extractor == null) {
-            extractor = new ParsingEmbeddedDocumentExtractor(context);
-        }
-        return extractor;
-    }
-
-    private void extractEmbeddedDocuments(PDDocument document)
-            throws IOException, SAXException, TikaException {
-        PDDocumentNameDictionary namesDictionary =
-                new PDDocumentNameDictionary(document.getDocumentCatalog());
-        PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
-        if (efTree == null) {
-            return;
-        }
-
-        Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
-        //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
-        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
-        //If there is a need we could add a fully recursive search to find a non-null
-        //Map<String, COSObjectable> that contains the doc info.
-        if (embeddedFileNames != null) {
-            processEmbeddedDocNames(embeddedFileNames);
-        } else {
-            List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
-            if (kids == null) {
-                return;
-            }
-            for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
-                embeddedFileNames = node.getNames();
-                if (embeddedFileNames != null) {
-                    processEmbeddedDocNames(embeddedFileNames);
-                }
-            }
-        }
-    }
-
-    private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
-            throws IOException, SAXException, TikaException {
-        if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
-            return;
-        }
-
-        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
-        for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
-            PDComplexFileSpecification spec = ent.getValue();
-            extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
-        }
-    }
-
-    private void extractMultiOSPDEmbeddedFiles(String displayName,
-                                       PDComplexFileSpecification spec,
-                                       EmbeddedDocumentExtractor extractor) throws IOException,
-            SAXException, TikaException {
-
-        if (spec == null) {
-            return;
-        }
-        //current strategy is to pull all, not just first non-null
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), extractor);
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
-    }
-
-    private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
-                                       String fileName, PDEmbeddedFile file,
-                                       EmbeddedDocumentExtractor extractor)
-            throws SAXException, IOException, TikaException {
-
-        if (file == null) {
-            //skip silently
-            return;
-        }
-        
-        fileName = (fileName == null) ? displayName : fileName;
-
-        // TODO: other metadata?
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
-        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
-        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
-        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
-                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
-        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
-
-        if (extractor.shouldParseEmbedded(metadata)) {
-            TikaInputStream stream = null;
-            try {
-                stream = TikaInputStream.get(file.createInputStream());
-                extractor.parseEmbedded(
-                        stream,
-                        new EmbeddedContentHandler(xhtml),
-                        metadata, false);
-
-                AttributesImpl attributes = new AttributesImpl();
-                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-                attributes.addAttribute("", "id", "id", "CDATA", fileName);
-                xhtml.startElement("div", attributes);
-                xhtml.endElement("div");
-            } finally {
-                IOUtils.closeQuietly(stream);
-            }
-        }
-    }
-
-    void handleCatchableIOE(IOException e) throws IOException {
-        if (config.isCatchIntermediateIOExceptions()) {
-            String msg = e.getMessage();
-            if (msg == null) {
-                msg = "IOException, no message";
-            }
-            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
-            exceptions.add(e);
-        } else {
-            throw e;
-        }
-    }
-
-    void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
-        if (config.getOCRStrategy().equals(NO_OCR)) {
-            return;
-        }
-        TesseractOCRConfig tesseractConfig =
-                context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
-
-        TesseractOCRParser tesseractOCRParser = new TesseractOCRParser();
-        if (! tesseractOCRParser.hasTesseract(tesseractConfig)) {
-            throw new TikaException("Tesseract is not available. "+
-                    "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
-        }
-
-        PDFRenderer renderer = new PDFRenderer(pdDocument);
-        TemporaryResources tmp = new TemporaryResources();
-        try {
-            BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOCRImageType());
-            Path tmpFile = tmp.createTempFile();
-            try (OutputStream os = Files.newOutputStream(tmpFile)) {
-                //TODO: get output format from TesseractConfig
-                ImageIOUtil.writeImage(image, config.getOCRImageFormatName(),
-                        os, config.getOCRDPI());
-            }
-            try (InputStream is = TikaInputStream.get(tmpFile)) {
-                tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
-            }
-        } catch (IOException e) {
-            handleCatchableIOE(e);
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("error writing OCR content from PDF", e);
-        } finally {
-            tmp.dispose();
-        }
-    }
-
-    @Override
-    protected void endPage(PDPage page) throws IOException {
-
-        try {
-            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
-            for (PDAnnotation annotation : page.getAnnotations()) {
-
-                if (annotation instanceof PDAnnotationFileAttachment) {
-                    PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
-                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
-                    try {
-                        extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor);
-                    } catch (SAXException e) {
-                        throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
-                    } catch (TikaException e) {
-                        throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
-                    } catch (IOException e) {
-                        handleCatchableIOE(e);
-                    }
-                }
-                // TODO: remove once PDFBOX-1143 is fixed:
-                if (config.getExtractAnnotationText()) {
-                    if (annotation instanceof PDAnnotationLink) {
-                        PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
-                        if (annotationlink.getAction() != null) {
-                            PDAction action = annotationlink.getAction();
-                            if (action instanceof PDActionURI) {
-                                PDActionURI uri = (PDActionURI) action;
-                                String link = uri.getURI();
-                                if (link != null) {
-                                    xhtml.startElement("div", "class", "annotation");
-                                    xhtml.startElement("a", "href", link);
-                                    xhtml.endElement("a");
-                                    xhtml.endElement("div");
-                                }
-                            }
-                        }
-                    }
-
-                    if (annotation instanceof PDAnnotationMarkup) {
-                        PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
-                        String title = annotationMarkup.getTitlePopup();
-                        String subject = annotationMarkup.getSubject();
-                        String contents = annotationMarkup.getContents();
-                        // TODO: maybe also annotationMarkup.getRichContents()?
-                        if (title != null || subject != null || contents != null) {
-                            xhtml.startElement("div", "class", "annotation");
-
-                            if (title != null) {
-                                xhtml.startElement("div", "class", "annotationTitle");
-                                xhtml.characters(title);
-                                xhtml.endElement("div");
-                            }
-
-                            if (subject != null) {
-                                xhtml.startElement("div", "class", "annotationSubject");
-                                xhtml.characters(subject);
-                                xhtml.endElement("div");
-                            }
-
-                            if (contents != null) {
-                                xhtml.startElement("div", "class", "annotationContents");
-                                xhtml.characters(contents);
-                                xhtml.endElement("div");
-                            }
-
-                            xhtml.endElement("div");
-                        }
-                    }
-                }
-            }
-            if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
-                doOCROnCurrentPage();
-            }
-            xhtml.endElement("div");
-        } catch (SAXException|TikaException e) {
-            throw new IOExceptionWithCause("Unable to end a page", e);
-        } catch (IOException e) {
-            exceptions.add(e);
-        } finally {
-            pageIndex++;
-        }
-    }
-
-    @Override
-    protected void startDocument(PDDocument pdf) throws IOException {
-        try {
-            xhtml.startDocument();
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to start a document", e);
-        }
-    }
-
-    @Override
-    protected void endDocument(PDDocument pdf) throws IOException {
-        try {
-            // Extract text for any bookmarks:
-            extractBookmarkText();
-            try {
-                extractEmbeddedDocuments(pdf);
-            } catch (IOException e) {
-                handleCatchableIOE(e);
-            }
-
-            //extract acroform data at end of doc
-            if (config.getExtractAcroFormContent() == true) {
-                try {
-                    extractAcroForm(pdf);
-                } catch (IOException e) {
-                    handleCatchableIOE(e);
-                }
-            }
-            xhtml.endDocument();
-        } catch (TikaException e) {
-            throw new IOExceptionWithCause("Unable to end a document", e);
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to end a document", e);
-        }
-    }
-
-    void extractBookmarkText() throws SAXException {
-        PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
-        if (outline != null) {
-            extractBookmarkText(outline);
-        }
-    }
-
-    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
-        PDOutlineItem current = bookmark.getFirstChild();
-        if (current != null) {
-            xhtml.startElement("ul");
-            while (current != null) {
-                xhtml.startElement("li");
-                xhtml.characters(current.getTitle());
-                xhtml.endElement("li");
-                // Recurse:
-                extractBookmarkText(current);
-                current = current.getNextSibling();
-            }
-            xhtml.endElement("ul");
-        }
-    }
-
-    void extractAcroForm(PDDocument pdf) throws IOException,
-            SAXException {
-        //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
-        //this code derives from Ben's code
-        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
-
-        if (catalog == null)
-            return;
-
-        PDAcroForm form = catalog.getAcroForm();
-        if (form == null)
-            return;
-
-        //if it has xfa, try that.
-        //if it doesn't exist or there's an exception,
-        //go with traditional AcroForm
-        PDXFAResource pdxfa = form.getXFA();
-
-        if (pdxfa != null) {
-            //if successful, return
-            XFAExtractor xfaExtractor = new XFAExtractor();
-            try (InputStream is = new BufferedInputStream(
-                    new ByteArrayInputStream(pdxfa.getBytes()))) {
-                xfaExtractor.extract(is, xhtml, metadata, context);
-                return;
-            } catch (XMLStreamException |IOException e) {
-                //if there was an xml parse exception in xfa, try the AcroForm
-            }
-        }
-
-        @SuppressWarnings("rawtypes")
-        List fields = form.getFields();
-
-        if (fields == null)
-            return;
-
-        @SuppressWarnings("rawtypes")
-        ListIterator itr = fields.listIterator();
-
-        if (itr == null)
-            return;
-
-        xhtml.startElement("div", "class", "acroform");
-        xhtml.startElement("ol");
-
-        while (itr.hasNext()) {
-            Object obj = itr.next();
-            if (obj != null && obj instanceof PDField) {
-                processAcroField((PDField) obj, 0);
-            }
-        }
-        xhtml.endElement("ol");
-        xhtml.endElement("div");
-    }
-
-    private void processAcroField(PDField field, final int currentRecursiveDepth)
-            throws SAXException, IOException {
-
-        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
-            return;
-        }
-        addFieldString(field);
-        if (field instanceof PDNonTerminalField) {
-            int r = currentRecursiveDepth + 1;
-            xhtml.startElement("ol");
-            for (PDField child : ((PDNonTerminalField)field).getChildren()) {
-                processAcroField(child, r);
-            }
-            xhtml.endElement("ol");
-        }
-    }
-
-    private void addFieldString(PDField field) throws SAXException {
-        //Pick partial name to present in content and altName for attribute
-        //Ignoring FullyQualifiedName for now
-        String partName = field.getPartialName();
-        String altName = field.getAlternateFieldName();
-
-        StringBuilder sb = new StringBuilder();
-        AttributesImpl attrs = new AttributesImpl();
-
-        if (partName != null) {
-            sb.append(partName).append(": ");
-        }
-        if (altName != null) {
-            attrs.addAttribute("", "altName", "altName", "CDATA", altName);
-        }
-        //return early if PDSignature field
-        if (field instanceof PDSignatureField) {
-            handleSignature(attrs, (PDSignatureField) field);
-            return;
-        }
-        String value = field.getValueAsString();
-        if (value != null && !value.equals("null")) {
-            sb.append(value);
-        }
-
-        if (attrs.getLength() > 0 || sb.length() > 0) {
-            xhtml.startElement("li", attrs);
-            xhtml.characters(sb.toString());
-            xhtml.endElement("li");
-        }
-    }
-
-    private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
-            throws SAXException {
-
-        PDSignature sig = sigField.getSignature();
-        if (sig == null) {
-            return;
-        }
-        Map<String, String> vals = new TreeMap<>();
-        vals.put("name", sig.getName());
-        vals.put("contactInfo", sig.getContactInfo());
-        vals.put("location", sig.getLocation());
-        vals.put("reason", sig.getReason());
-
-        Calendar cal = sig.getSignDate();
-        if (cal != null) {
-            dateFormat.setTimeZone(cal.getTimeZone());
-            vals.put("date", dateFormat.format(cal.getTime()));
-        }
-        //see if there is any data
-        int nonNull = 0;
-        for (String val : vals.keySet()) {
-            if (val != null && !val.equals("")) {
-                nonNull++;
-            }
-        }
-        //if there is, process it
-        if (nonNull > 0) {
-            xhtml.startElement("li", parentAttributes);
-
-            AttributesImpl attrs = new AttributesImpl();
-            attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
-
-            xhtml.startElement("ol", attrs);
-            for (Map.Entry<String, String> e : vals.entrySet()) {
-                if (e.getValue() == null || e.getValue().equals("")) {
-                    continue;
-                }
-                attrs = new AttributesImpl();
-                attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
-                xhtml.startElement("li", attrs);
-                xhtml.characters(e.getValue());
-                xhtml.endElement("li");
-            }
-            xhtml.endElement("ol");
-            xhtml.endElement("li");
-        }
-    }
-}

[3/5] tika git commit: TIKA-2059 - Merge multimedia and pdf parser modules and bundles

Reply via email to