tika-advanced-parser-m...

bob Sat, 16 Jan 2016 10:24:07 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PropertyTypeException;
+import org.junit.Test;
+
+/**
+ * Tests for {@link AccessChecker}: which combinations of the
+ * {@link AccessPermissions#EXTRACT_CONTENT} and
+ * {@link AccessPermissions#EXTRACT_FOR_ACCESSIBILITY} metadata values
+ * make {@code check} throw an {@link AccessPermissionException}.
+ */
+public class AccessCheckerTest {
+
+    /**
+     * A checker built with the no-arg constructor must accept every
+     * permission combination; the test passes simply by not throwing.
+     */
+    @Test
+    public void testLegacy() throws AccessPermissionException {
+        //legacy behavior; don't bother checking
+        AccessChecker checker = new AccessChecker();
+
+        checker.check(getMetadata(false, false));
+        checker.check(getMetadata(false, true));
+        checker.check(getMetadata(true, true));
+    }
+
+    /**
+     * A checker built with {@code false} must reject a document that
+     * forbids extraction, whether or not the document allows extraction
+     * for accessibility.
+     */
+    @Test
+    public void testNoExtraction() {
+        //allow nothing
+        AccessChecker checker = new AccessChecker(false);
+
+        assertTrue("correct exception with no extraction, no extract for accessibility",
+                isRejected(checker, getMetadata(false, false)));
+
+        //document allows extraction for accessibility,
+        //but application is not an accessibility application
+        assertTrue("correct exception with no extraction, extract for accessibility",
+                isRejected(checker, getMetadata(false, true)));
+    }
+
+    /**
+     * A checker built with {@code true} must accept a document that allows
+     * extraction for accessibility only, and reject one that allows nothing.
+     */
+    @Test
+    public void testExtractOnlyForAccessibility() throws AccessPermissionException {
+        //allow accessibility
+        AccessChecker checker = new AccessChecker(true);
+
+        // must not throw
+        checker.check(getMetadata(false, true));
+
+        assertTrue("correct exception", isRejected(checker, getMetadata(false, false)));
+    }
+
+    /**
+     * Odd combination: the document allows extraction but not extraction
+     * for accessibility. Neither kind of checker may throw.
+     */
+    @Test
+    public void testCrazyExtractNotForAccessibility() throws AccessPermissionException {
+        Metadata m = getMetadata(true, false);
+
+        //allow accessibility
+        AccessChecker checker = new AccessChecker(true);
+        checker.check(m);
+
+        //don't extract for accessibility
+        checker = new AccessChecker(false);
+        //if extract content is allowed, the checker shouldn't
+        //check the value of extract for accessibility
+        checker.check(m);
+    }
+
+    /**
+     * Adding a second value to either permission key must fail with a
+     * {@link PropertyTypeException}.
+     */
+    @Test
+    public void testCantAddMultiplesToMetadata() {
+        Metadata m = new Metadata();
+        boolean ex = false;
+        m.add(AccessPermissions.EXTRACT_CONTENT, "true");
+        try {
+            m.add(AccessPermissions.EXTRACT_CONTENT, "false");
+        } catch (PropertyTypeException e) {
+            ex = true;
+        }
+        assertTrue("can't add multiple values", ex);
+
+        m = new Metadata();
+        ex = false;
+        m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true");
+        try {
+            m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false");
+        } catch (PropertyTypeException e) {
+            ex = true;
+        }
+        assertTrue("can't add multiple values", ex);
+    }
+
+    /**
+     * Runs the checker against the metadata and reports whether it refused.
+     *
+     * @param checker the checker under test
+     * @param m       the metadata carrying the permission values
+     * @return {@code true} if {@code check} threw an
+     *         {@link AccessPermissionException}, {@code false} if it returned normally
+     */
+    private static boolean isRejected(AccessChecker checker, Metadata m) {
+        try {
+            checker.check(m);
+            return false;
+        } catch (AccessPermissionException e) {
+            return true;
+        }
+    }
+
+    /**
+     * Builds a metadata object with both permission keys set.
+     *
+     * @param allowExtraction                 value stored under EXTRACT_CONTENT
+     * @param allowExtractionForAccessibility value stored under EXTRACT_FOR_ACCESSIBILITY
+     * @return a fresh Metadata holding the two values as "true"/"false" strings
+     */
+    private Metadata getMetadata(boolean allowExtraction, boolean allowExtractionForAccessibility) {
+        Metadata m = new Metadata();
+        m.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(allowExtraction));
+        m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
+                Boolean.toString(allowExtractionForAccessibility));
+        return m;
+    }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,1377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.DocumentSelector;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing pdf files.
+ */
+public class PDFParserTest extends TikaTest {
+
+    // Media type constants.
+    public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
+    public static final MediaType TYPE_EMF = MediaType.application("x-emf");
+    public static final MediaType TYPE_PDF = MediaType.application("pdf");
+    public static final MediaType TYPE_DOCX =
+            MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    public static final MediaType TYPE_DOC = MediaType.application("msword");
+    // Level of the "org.apache.pdfbox" logger as found by setup(); restored by tearDown().
+    public static Level PDFBOX_LOG_LEVEL = Level.INFO;
+
+    /**
+     * Records the current level of the "org.apache.pdfbox" logger in
+     * {@code PDFBOX_LOG_LEVEL} and then switches that logger off for the
+     * duration of this test class.
+     */
+    @BeforeClass
+    public static void setup() {
+        final Logger pdfBoxLogger = Logger.getLogger("org.apache.pdfbox");
+        PDFBOX_LOG_LEVEL = pdfBoxLogger.getLevel();
+        pdfBoxLogger.setLevel(Level.OFF);
+    }
+
+    /**
+     * Puts the "org.apache.pdfbox" logger back to the level held in
+     * {@code PDFBOX_LOG_LEVEL}.
+     */
+    @AfterClass
+    public static void tearDown() {
+        final Logger pdfBoxLogger = Logger.getLogger("org.apache.pdfbox");
+        pdfBoxLogger.setLevel(PDFBOX_LOG_LEVEL);
+    }
+
+    /**
+     * Counts the occurrences of {@code needle} in {@code haystack}.
+     * Overlapping occurrences are counted separately, because the search
+     * resumes one character after the start of each match.
+     *
+     * @param needle   the non-empty text to look for
+     * @param haystack the text to search in
+     * @return the number of positions at which {@code needle} starts
+     * @throws IllegalArgumentException if {@code needle} is empty; an empty
+     *         needle matches at every index, so indexOf never returns -1 and
+     *         the scan would never end
+     */
+    private static int substringCount(String needle, String haystack) {
+        if (needle.isEmpty()) {
+            throw new IllegalArgumentException("needle must not be empty");
+        }
+        int count = 0;
+        for (int at = haystack.indexOf(needle); at != -1; at = haystack.indexOf(needle, at + 1)) {
+            count++;
+        }
+        return count;
+    }
+
+    /**
+     * Parses testPDF.pdf through the auto-detecting parser and checks the
+     * detected content type, core metadata values and extracted text.
+     */
+    @Test
+    public void testPdfParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        Metadata metadata = new Metadata();
+
+        InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF.pdf");
+
+        String content = getText(stream, parser, metadata);
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Bertrand Delacr\u00e9taz", metadata.get(Metadata.AUTHOR));
+        assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL));
+        assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+
+        // Can't reliably test dates yet - see TIKA-451
+//        assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.CREATION_DATE));
+//        assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.LAST_MODIFIED));
+
+        assertContains("Apache Tika", content);
+        assertContains("Tika - Content Analysis Toolkit", content);
+        assertContains("incubator", content);
+        assertContains("Apache Software Foundation", content);
+        // testing how the end of one paragraph is separated from start of the next one
+        assertFalse("should have word boundary after headline",
+                content.contains("ToolkitApache"));
+        assertFalse("should have word boundary between paragraphs",
+                content.contains("libraries.Apache"));
+    }
+
+    /**
+     * Parses testPDF.pdf with a {@code null} content handler and checks
+     * that the metadata is still populated.
+     */
+    @Test
+    public void testPdfParsingMetadataOnly() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF.pdf")) {
+            // null handler: only the metadata object receives output
+            parser.parse(stream, null, metadata, new ParseContext());
+        }
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL));
+        assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+    }
+
+    /**
+     * Parses testPDF-custommetadata.pdf and checks that custom
+     * single-valued and multi-valued properties appear in the metadata
+     * next to the standard ones.
+     */
+    @Test
+    public void testCustomMetadata() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        Metadata metadata = new Metadata();
+
+        InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF-custommetadata.pdf");
+
+        String content = getText(stream, parser, metadata);
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Document author", metadata.get(Metadata.AUTHOR));
+        assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));
+
+        assertEquals("Custom Value", metadata.get("Custom Property"));
+
+        // get() on the array-valued key is expected to yield the first entry
+        assertEquals("Array Entry 1", metadata.get("Custom Array"));
+        String[] customArray = metadata.getValues("Custom Array");
+        assertEquals(2, customArray.length);
+        assertEquals("Array Entry 1", customArray[0]);
+        assertEquals("Array Entry 2", customArray[1]);
+
+        assertContains("Hello World!", content);
+    }
+
+    /**
+     * PDFs can be "protected" with the default password. This means
+     * they're encrypted (potentially both text and metadata),
+     * but we can decrypt them easily.
+     * <p>
+     * The same document is parsed four times: with no password provider,
+     * with an explicit empty password, with a wrong password, and with a
+     * wrong password plus the non-sequential parser option.
+     */
+    @Test
+    public void testProtectedPDF() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+
+        // No password provider at all
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        parseProtectedPdf(parser, handler, metadata, new ParseContext());
+        assertProtectedPdfMetadata(metadata);
+        assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
+        assertProtectedPdfContent(handler.toString());
+
+        // Try again with an explicit empty password
+        handler = new BodyContentHandler();
+        metadata = new Metadata();
+        parseProtectedPdf(parser, handler, metadata, fixedPasswordContext(""));
+        assertProtectedPdfMetadata(metadata);
+        // Take the text from this parse's handler, so the checks cover
+        // this run instead of re-reading the text of the first run.
+        assertProtectedPdfContent(handler.toString());
+
+        //now test wrong password
+        assertWrongPasswordRejected(parser, fixedPasswordContext("WRONG!!!!"));
+
+        //now test wrong password with non sequential parser
+        ParseContext context = fixedPasswordContext("WRONG!!!!");
+        PDFParserConfig config = new PDFParserConfig();
+        config.setUseNonSequentialParser(true);
+        context.set(PDFParserConfig.class, config);
+        assertWrongPasswordRejected(parser, context);
+    }
+
+    /** Parses testPDF_protected.pdf into the given handler and metadata, closing the stream afterwards. */
+    private static void parseProtectedPdf(Parser parser, ContentHandler handler,
+                                          Metadata metadata, ParseContext context) throws Exception {
+        try (InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF_protected.pdf")) {
+            parser.parse(stream, handler, metadata, context);
+        }
+    }
+
+    /** Builds a parse context whose PasswordProvider always answers with {@code password}. */
+    private static ParseContext fixedPasswordContext(final String password) {
+        ParseContext context = new ParseContext();
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return password;
+            }
+        });
+        return context;
+    }
+
+    /** Checks the metadata expected after a successful parse of testPDF_protected.pdf. */
+    private static void assertProtectedPdfMetadata(Metadata metadata) {
+        assertEquals("true", metadata.get("pdf:encrypted"));
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
+        assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, "
+                        + "Executive Director, Financial Stability delivered at the Financial "
+                        + "Student Association, Amsterdam on 28 April 2009",
+                metadata.get(TikaCoreProperties.TITLE));
+    }
+
+    /** Checks the text expected after a successful parse of testPDF_protected.pdf. */
+    private static void assertProtectedPdfContent(String content) {
+        assertContains("RETHINKING THE FINANCIAL NETWORK", content);
+        assertContains("On 16 November 2002", content);
+        assertContains("In many important respects", content);
+    }
+
+    /**
+     * Parses testPDF_protected.pdf with a context that supplies a wrong
+     * password and checks that an EncryptedDocumentException is thrown,
+     * that only minimal metadata was recorded and that no text was emitted.
+     */
+    private static void assertWrongPasswordRejected(Parser parser, ParseContext context)
+            throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        boolean ex = false;
+        try {
+            parseProtectedPdf(parser, handler, metadata, context);
+        } catch (EncryptedDocumentException e) {
+            ex = true;
+        }
+
+        assertTrue("encryption exception", ex);
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("true", metadata.get("pdf:encrypted"));
+        //pdf:encrypted, X-Parsed-By and Content-Type
+        assertEquals("very little metadata should be parsed", 3, metadata.names().length);
+        assertEquals(0, handler.toString().length());
+    }
+
+    /**
+     * With default settings the two text boxes of testPDFTwoTextBoxes.pdf
+     * are emitted one after the other (left box, then right box).
+     */
+    @Test
+    public void testTwoTextBoxes() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFTwoTextBoxes.pdf");
+        String content = getText(stream, parser);
+        // collapse all whitespace runs so line breaks don't matter
+        content = content.replaceAll("\\s+", " ");
+        assertContains(
+                "Left column line 1 Left column line 2 Right column line 1 Right column line 2",
+                content);
+    }
+
+    /**
+     * Parses testPDFVarious.pdf and checks that text from many document
+     * features (footnotes, header/footer, text box, tables, lists,
+     * keywords, subject, non-Latin text) shows up in the output.
+     */
+    @Test
+    public void testVarious() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        Metadata metadata = new Metadata();
+        InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFVarious.pdf");
+
+        String content = getText(stream, parser, metadata);
+        // whitespace-collapsed copy for the checks that span table cells
+        String collapsed = content.replaceAll("\\s+", " ");
+
+        assertContains("Footnote appears here", content);
+        assertContains("This is a footnote.", content);
+        assertContains("This is the header text.", content);
+        assertContains("This is the footer text.", content);
+        assertContains("Here is a text box", content);
+        assertContains("Bold", content);
+        assertContains("italic", content);
+        assertContains("underline", content);
+        assertContains("superscript", content);
+        assertContains("subscript", content);
+        assertContains("Here is a citation:", content);
+        assertContains("Figure 1 This is a caption for Figure 1", content);
+        assertContains("(Kramer)", content);
+        assertContains(
+                "Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3",
+                collapsed);
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2",
+                collapsed);
+        assertContains("This is a hyperlink", content);
+        assertContains("Here is a list:", content);
+        for (int row = 1; row <= 3; row++) {
+            //assertContains("·\tBullet " + row, content);
+            //assertContains("\u00b7\tBullet " + row, content);
+            assertContains("Bullet " + row, content);
+        }
+        assertContains("Here is a numbered list:", content);
+        for (int row = 1; row <= 3; row++) {
+            //assertContains(row + ")\tNumber bullet " + row, content);
+            assertContains(row + ") Number bullet " + row, content);
+        }
+
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
+                assertContains("Row " + row + " Col " + col, content);
+            }
+        }
+
+        assertContains("Keyword1 Keyword2", content);
+        assertEquals("Keyword1 Keyword2",
+                     metadata.get(Metadata.KEYWORDS));
+
+        assertContains("Subject is here", content);
+        assertEquals("Subject is here",
+                     metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("Subject is here",
+                     metadata.get(Metadata.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", content);
+        // Special version of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+        // 6 other characters
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f",
+                content);
+
+        assertContains("And then some Gothic text:", content);
+        // TODO: I saved the word doc as a PDF, but that
+        // process somehow, apparently lost the gothic
+        // chars, so we cannot test this here:
+        //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+    }
+
+    /**
+     * Annotation text is extracted by default and can be switched off,
+     * either on the parser's own config or through a PDFParserConfig
+     * placed in the ParseContext. Also checks that the XML output has
+     * balanced paragraph tags (TIKA-738).
+     */
+    @Test
+    public void testAnnotations() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
+        String content = getText(stream, parser);
+        // collapse whitespace, including non-breaking spaces
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        assertContains("Here is some text", content);
+        assertContains("Here is a comment", content);
+
+        // Test w/ annotation text disabled:
+        PDFParser pdfParser = new PDFParser();
+        pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
+        stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
+        content = getText(stream, pdfParser);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        assertContains("Here is some text", content);
+        assertEquals(-1, content.indexOf("Here is a comment"));
+
+        // annotation text disabled through parsecontext
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractAnnotationText(false);
+        context.set(PDFParserConfig.class, config);
+        stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
+        content = getText(stream, parser, context);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        assertContains("Here is some text", content);
+        assertEquals(-1, content.indexOf("Here is a comment"));
+
+
+        // TIKA-738: make sure no extra </p> tags
+        String xml = getXML("testAnnotations.pdf").xml;
+        assertEquals(substringCount("<p>", xml),
+                substringCount("</p>", xml));
+    }
+
+    // TIKA-981
+    /**
+     * Text held in a popup annotation of testPopupAnnotation.pdf must be
+     * part of the extracted content.
+     */
+    @Test
+    public void testPopupAnnotation() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        InputStream stream = getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
+        String content = getText(stream, parser);
+        assertContains("this is the note", content);
+        assertContains("igalsh", content);
+    }
+
+    /**
+     * The XML produced for testPDFPackage.pdf must mention both
+     * "PDF1" and "PDF2".
+     */
+    @Test
+    public void testEmbeddedPDFs() throws Exception {
+        final XMLResult extracted = getXML("testPDFPackage.pdf");
+        for (String expected : new String[]{"PDF1", "PDF2"}) {
+            assertContains(expected, extracted.xml);
+        }
+    }
+
+    /**
+     * After stripping all whitespace, the XML for testPageNumber.pdf must
+     * contain a paragraph holding just "1".
+     */
+    @Test
+    public void testPageNumber() throws Exception {
+        String xml = getXML("testPageNumber.pdf").xml;
+        String withoutWhitespace = xml.replaceAll("\\s+", "");
+        assertContains("<p>1</p>", withoutWhitespace);
+    }
+
+    /**
+     * Test to ensure that Links are extracted from the text
+     * <p/>
+     * Note - the PDF contains the text "This is a hyperlink" which has
+     * a hyperlink annotation, linking to the tika site, on it. This
+     * test will need updating when we're able to apply the annotation
+     * to the text itself, rather than following on afterwards as now
+     */
+    @Test
+    public void testLinks() throws Exception {
+        final XMLResult result = getXML("testPDFVarious.pdf");
+        assertContains(
+                "<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>",
+                result.xml);
+    }
+
+    /**
+     * Toggles the enableAutoSpace option, first on the PDFParser's own
+     * config and then through a PDFParserConfig in the ParseContext, and
+     * checks whether "Here is some formatted text" comes out intact
+     * (option off) or not (option on).
+     */
+    @Test
+    public void testDisableAutoSpace() throws Exception {
+        PDFParser parser = new PDFParser();
+        parser.getPDFParserConfig().setEnableAutoSpace(false);
+        InputStream stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+        String content = getText(stream, parser);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        // Text is correct when autoSpace is off:
+        assertContains("Here is some formatted text", content);
+
+        parser.getPDFParserConfig().setEnableAutoSpace(true);
+        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+        content = getText(stream, parser);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        // Text has extra spaces when autoSpace is on
+        assertEquals(-1, content.indexOf("Here is some formatted text"));
+
+        //now try with autodetect
+        Parser autoParser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        context.set(PDFParserConfig.class, config);
+        //default is true
+        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+        content = getText(stream, autoParser, context);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        // Text has extra spaces when autoSpace is on
+        assertEquals(-1, content.indexOf("Here is some formatted text"));
+
+        config.setEnableAutoSpace(false);
+
+        // NOTE(review): this call passes the PDFParser (whose own config has
+        // autoSpace on) rather than autoParser; it passes only if the config
+        // in the context takes precedence. Confirm whether autoParser was meant.
+        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
+        content = getText(stream, parser, context);
+        content = content.replaceAll("[\\s\u00a0]+", " ");
+        // Text is correct when autoSpace is off:
+        assertContains("Here is some formatted text", content);
+
+    }
+
+    /**
+     * Toggles suppressDuplicateOverlappingText, first on the PDFParser's
+     * own config and then through a PDFParserConfig in the ParseContext,
+     * and checks the text extracted from testOverlappingText.pdf.
+     */
+    @Test
+    public void testDuplicateOverlappingText() throws Exception {
+        PDFParser parser = new PDFParser();
+        InputStream stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+        // Default is false (keep overlapping text):
+        String content = getText(stream, parser);
+        assertContains("Text the first timeText the second time", content);
+
+        parser.getPDFParserConfig().setSuppressDuplicateOverlappingText(true);
+        stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+        content = getText(stream, parser);
+        // "Text the first" was dedup'd:
+        assertContains("Text the first timesecond time", content);
+
+        //now try with autodetect
+        Parser autoParser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        context.set(PDFParserConfig.class, config);
+        stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+        // Default is false (keep overlapping text):
+        content = getText(stream, autoParser, context);
+        assertContains("Text the first timeText the second time", content);
+
+        config.setSuppressDuplicateOverlappingText(true);
+        stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
+        content = getText(stream, autoParser, context);
+        // "Text the first" was dedup'd:
+        assertContains("Text the first timesecond time", content);
+
+    }
+
+    /**
+     * Toggles sortByPosition and checks the order in which the two text
+     * boxes of testPDFTwoTextBoxes.pdf are emitted: box after box when
+     * the option is off, interleaved line by line when it is on.
+     */
+    @Test
+    public void testSortByPosition() throws Exception {
+        PDFParser parser = new PDFParser();
+        parser.getPDFParserConfig().setEnableAutoSpace(false);
+        InputStream stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        // Default is false (do not sort):
+        String content = getText(stream, parser);
+        content = content.replaceAll("\\s+", " ");
+        assertContains(
+                "Left column line 1 Left column line 2 Right column line 1 Right column line 2",
+                content);
+
+        parser.getPDFParserConfig().setSortByPosition(true);
+        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        content = getText(stream, parser);
+        content = content.replaceAll("\\s+", " ");
+        // Column text is now interleaved:
+        assertContains(
+                "Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+                content);
+
+        //now try setting autodetect via parsecontext
+        AutoDetectParser autoParser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        context.set(PDFParserConfig.class, config);
+        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        // Default is false (do not sort):
+        content = getText(stream, autoParser, context);
+        content = content.replaceAll("\\s+", " ");
+        assertContains(
+                "Left column line 1 Left column line 2 Right column line 1 Right column line 2",
+                content);
+
+        config.setSortByPosition(true);
+        context.set(PDFParserConfig.class, config);
+        // NOTE(review): this call passes neither autoParser nor context, so it
+        // repeats the PDFParser check above and does not exercise the config
+        // just placed in the context. Confirm the intended call and the
+        // expected text before changing it.
+        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
+        content = getText(stream, parser);
+        content = content.replaceAll("\\s+", " ");
+        // Column text is now interleaved:
+        assertContains(
+                "Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+                content);
+
+    }
+
+    // TIKA-1035
+    /**
+     * The bookmark text must be present in the XML for
+     * testPDF_bookmarks.pdf and must appear before the closing body tag.
+     */
+    @Test
+    public void testBookmarks() throws Exception {
+        final String xml = getXML("testPDF_bookmarks.pdf").xml;
+        final int bookmarkAt = xml.indexOf("Denmark bookmark is here");
+        final int bodyEndAt = xml.indexOf("</body>");
+        assertTrue(bookmarkAt >= 0);
+        assertTrue(bodyEndAt >= 0);
+        assertTrue(bodyEndAt > bookmarkAt);
+    }
+
+    //TIKA-1124
+    /**
+     * Parses a docx that embeds a pdf that embeds another docx and checks
+     * that the markers "Outer_haystack", "pdf_haystack" and "Needle" all
+     * appear in the extracted text, in that order.  Then runs the same file
+     * through a {@link ParserContainerExtractor} and checks the names and
+     * media types of the three embedded resources that were tracked.
+     */
+    @Test
+    public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
+       /* format of test doc:
+         docx/
+            pdf/
+               docx
+       */
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        context.set(org.apache.tika.parser.Parser.class, parser);
+        //try-with-resources closes the stream and tolerates a null resource
+        try (InputStream stream = getResourceAsStream(
+                "/test-documents/testPDFEmbeddingAndEmbedded.docx")) {
+            parser.parse(stream, handler, metadata, context);
+        }
+        String content = handler.toString();
+        int outerHaystack = content.indexOf("Outer_haystack");
+        int pdfHaystack = content.indexOf("pdf_haystack");
+        int needle = content.indexOf("Needle");
+        assertTrue(outerHaystack > -1);
+        assertTrue(pdfHaystack > -1);
+        assertTrue(needle > -1);
+        assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
+
+        TrackingHandler tracker = new TrackingHandler();
+        ContainerExtractor ex = new ParserContainerExtractor();
+        //close tis itself (not the stream from the first parse) when done
+        try (TikaInputStream tis = TikaInputStream.get(
+                getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"))) {
+            ex.extract(tis, ex, tracker);
+            //isSupported is handed tis, so it has to run before tis is closed
+            assertEquals(true, ex.isSupported(tis));
+        }
+        assertEquals(3, tracker.filenames.size());
+        assertEquals(3, tracker.mediaTypes.size());
+        assertEquals("image1.emf", tracker.filenames.get(0));
+        assertNull(tracker.filenames.get(1));
+        assertEquals("Test.docx", tracker.filenames.get(2));
+        assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
+        assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
+        assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
+    }
+
+    /**
+     * Runs every *.pdf file found under /test-documents through the
+     * traditional sequential parser and through the newer nonsequential
+     * parser and compares the extracted text and metadata of the two runs.
+     * Files that need a user password for the sequential run are skipped.
+     * <p/>
+     * TODO: more testing
+     */
+    @Test
+    public void testSequentialParser() throws Exception {
+
+        //one parser + context per mode
+        PDFParserConfig seqConfig = new PDFParserConfig();
+        seqConfig.setUseNonSequentialParser(false);
+        ParseContext seqContext = new ParseContext();
+        seqContext.set(PDFParserConfig.class, seqConfig);
+        Parser sequentialParser = new AutoDetectParser();
+
+        PDFParserConfig nonSeqConfig = new PDFParserConfig();
+        nonSeqConfig.setUseNonSequentialParser(true);
+        ParseContext nonSeqContext = new ParseContext();
+        nonSeqContext.set(PDFParserConfig.class, nonSeqConfig);
+        Parser nonSequentialParser = new AutoDetectParser();
+
+        //files whose metadata is expected to differ between the two modes
+        Set<String> knownMetadataDiffs = new HashSet<String>();
+        //PDFBox-1792/Tika-1203
+        knownMetadataDiffs.add("testAnnotations.pdf");
+        // Added for TIKA-93.
+        knownMetadataDiffs.add("testOCR.pdf");
+        // Added for TIKA-1085
+        knownMetadataDiffs.add("testPDF_bom.pdf");
+
+        //files whose content is expected to differ; empty for now
+        Set<String> knownContentDiffs = new HashSet<String>();
+
+        File testDocs = new File(this.getClass().getResource("/test-documents").toURI());
+        int pdfCount = 0;
+        for (File candidate : testDocs.listFiles()) {
+            String fileName = candidate.getName();
+            boolean isPdf = fileName.toLowerCase(Locale.ROOT).endsWith(".pdf");
+            if (!isPdf) {
+                continue;
+            }
+
+            Metadata sequentialMetadata = new Metadata();
+            String sequentialContent;
+            try {
+                sequentialContent = getText(new FileInputStream(candidate),
+                        sequentialParser, seqContext, sequentialMetadata);
+            } catch (EncryptedDocumentException e) {
+                //silently skip a file that requires a user password
+                continue;
+            } catch (Exception e) {
+                throw new TikaException("Sequential Parser failed on test file " + candidate, e);
+            }
+
+            //only files that made it through the sequential run are counted
+            pdfCount++;
+
+            Metadata nonSequentialMetadata = new Metadata();
+            String nonSequentialContent;
+            try {
+                nonSequentialContent = getText(new FileInputStream(candidate),
+                        nonSequentialParser, nonSeqContext, nonSequentialMetadata);
+            } catch (Exception e) {
+                throw new TikaException("Non-Sequential Parser failed on test file " + candidate, e);
+            }
+
+            if (!knownContentDiffs.contains(fileName)) {
+                assertEquals(fileName, sequentialContent, nonSequentialContent);
+            } else {
+                assertFalse(fileName, sequentialContent.equals(nonSequentialContent));
+            }
+
+            //known metadata differences must actually differ
+            if (!knownMetadataDiffs.contains(fileName)) {
+                assertEquals(fileName, sequentialMetadata, nonSequentialMetadata);
+            } else {
+                assertFalse(fileName, sequentialMetadata.equals(nonSequentialMetadata));
+            }
+        }
+        //make sure nothing went wrong with getting the resource to test-documents
+        //must have tested >= 15 pdfs
+        assertTrue("Number of pdf files tested >= 15 in non-sequential parser test",
+                pdfCount >= 15);
+    }
+
+
+    // TIKA-973
+    //commented out until test documents that are unambiguously
+    //consistent with Apache License v2.0 are contributed.
+    //TODO: add back test for AcroForm extraction; test document should include
+    //recursive forms
+/*    public void testAcroForm() throws Exception{
+       Parser p = new AutoDetectParser();
+       ParseContext context = new ParseContext();
+       InputStream stream = 
getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
+       String txt = getText(stream, p, context);
+       stream.close();
+
+       //simple first level form contents
+       assertContains("to: John Doe", txt);
+       //checkbox
+       assertContains("xpackaging: Yes", txt);
+       
+       //this guarantees that the form processor
+       //worked recursively at least once...i.e. it didn't just
+       //take the first form
+       stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
+       txt = getText(stream, p, context);
+       stream.close();
+       assertContains("123 Main St.", txt);
+       
+       
+       //now test with nonsequential parser
+       PDFParserConfig config = new PDFParserConfig();
+       config.setUseNonSequentialParser(true);
+       context.set(PDFParserConfig.class, config);
+       stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
+       txt = getText(stream, p, context);
+       stream.close();
+       
+       //simple first level form contents
+       assertContains("to: John Doe", txt);
+       //checkbox
+       assertContains("xpackaging: Yes", txt);
+       
+       //this guarantees that the form processor
+       //worked recursively at least once...i.e. it didn't just
+       //take the first form
+       stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
+       txt = getText(stream, p, context);
+       assertContains("123 Main St.", txt);
+       stream.close();     
+    }
+*/
+
+    //TIKA-1226
+    @Test
+    public void testSignatureInAcroForm() throws Exception {
+        //The current test doc does not contain any content in the signature 
area.
+        //This just tests that a RuntimeException is not thrown.
+        //TODO: find a better test file for this issue.
+        String xml = getXML("/testPDF_acroform3.pdf").xml;
+        assertTrue("found", (xml.contains("<li>aTextField: TIKA-1226</li>")));
+    }
+
+    @Test // TIKA-1228, TIKA-1268
+    public void testEmbeddedFilesInChildren() throws Exception {
+        //"regressiveness" exists only in Unit10.doc not in the container pdf document
+        String containerXml = getXML("/testPDF_childAttachments.pdf").xml;
+        assertTrue(containerXml.contains("regressiveness"));
+
+        //second pass: collect one Metadata object per (embedded) document
+        Parser delegate = new AutoDetectParser();
+        BasicContentHandlerFactory handlerFactory =
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(delegate, handlerFactory);
+
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+        pdfConfig.setExtractUniqueInlineImagesOnly(false);
+
+        ParseContext context = new ParseContext();
+        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, pdfConfig);
+        context.set(org.apache.tika.parser.Parser.class, wrapper);
+
+        try (TikaInputStream tis = TikaInputStream.get(
+                getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
+            wrapper.parse(tis, new BodyContentHandler(-1), new Metadata(), context);
+        }
+
+        List<Metadata> collected = wrapper.getMetadata();
+        assertEquals(5, collected.size());
+
+        Metadata container = collected.get(0);
+        Metadata jpeg = collected.get(1);
+        Metadata tiff = collected.get(2);
+        Metadata jobOptions = collected.get(3);
+        Metadata wordDoc = collected.get(4);
+
+        //resource names
+        assertNull(container.get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("image0.jpg", jpeg.get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("Press Quality(1).joboptions", jobOptions.get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("Unit10.doc", wordDoc.get(Metadata.RESOURCE_NAME_KEY));
+
+        //content types
+        assertEquals(MediaType.image("jpeg").toString(), jpeg.get(Metadata.CONTENT_TYPE));
+        assertEquals(MediaType.image("tiff").toString(), tiff.get(Metadata.CONTENT_TYPE));
+        assertEquals("text/plain; charset=ISO-8859-1", jobOptions.get(Metadata.CONTENT_TYPE));
+        assertEquals(TYPE_DOC.toString(), wordDoc.get(Metadata.CONTENT_TYPE));
+    }
+
+
+    @Test
+    public void testEmbeddedFilesInAnnotations() throws Exception {
+        String xml = getXML("/testPDFFileEmbInAnnotation.pdf").xml;
+
+        assertTrue(xml.contains("This is a Excel"));
+    }
+
+    /**
+     * Checks that parsing a pdf fires exactly one endDocument event on the
+     * content handler.
+     */
+    @Test
+    public void testSingleCloseDoc() throws Exception {
+        //TIKA-1341
+        Parser p = new AutoDetectParser();
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        //keep the concrete type so the counter can be read without a cast
+        EventCountingHandler h = new EventCountingHandler();
+        //try-with-resources: the stream is released whether or not parse throws
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFTripleLangTitle.pdf")) {
+            p.parse(is, h, m, c);
+        }
+        assertEquals(1, h.getEndDocument());
+    }
+
+    /**
+     * For each testPDF_Version.*.pdf document, checks that the expected
+     * dc:format value is among those reported, that pdf:PDFVersion matches
+     * and, where an extension level is expected, that
+     * pdf:PDFExtensionVersion matches.  Finally checks all dc:format values
+     * and the pdfaid properties of the 11.x PDF/A-1b document.
+     */
+    @Test
+    public void testVersions() throws Exception {
+
+        Map<String, String> dcFormat = new HashMap<String, String>();
+        dcFormat.put("4.x", "application/pdf; version=1.3");
+        dcFormat.put("5.x", "application/pdf; version=1.4");
+        dcFormat.put("6.x", "application/pdf; version=1.5");
+        dcFormat.put("7.x", "application/pdf; version=1.6");
+        dcFormat.put("8.x", "application/pdf; version=1.7");
+        dcFormat.put("9.x", "application/pdf; version=1.7");
+        dcFormat.put("10.x", "application/pdf; version=1.7");
+        dcFormat.put("11.x.PDFA-1b", "application/pdf; version=1.7");
+
+        Map<String, String> pdfVersions = new HashMap<String, String>();
+        pdfVersions.put("4.x", "1.3");
+        pdfVersions.put("5.x", "1.4");
+        pdfVersions.put("6.x", "1.5");
+        pdfVersions.put("7.x", "1.6");
+        pdfVersions.put("8.x", "1.7");
+        pdfVersions.put("9.x", "1.7");
+        pdfVersions.put("10.x", "1.7");
+        pdfVersions.put("11.x.PDFA-1b", "1.7");
+
+        //only these files are expected to carry an extension level
+        Map<String, String> pdfExtensionVersions = new HashMap<String, String>();
+        pdfExtensionVersions.put("9.x", "1.7 Adobe Extension Level 3");
+        pdfExtensionVersions.put("10.x", "1.7 Adobe Extension Level 8");
+        pdfExtensionVersions.put("11.x.PDFA-1b", "1.7 Adobe Extension Level 8");
+
+        Parser p = new AutoDetectParser();
+        for (Map.Entry<String, String> e : dcFormat.entrySet()) {
+            String fName = "testPDF_Version." + e.getKey() + ".pdf";
+            Metadata m = new Metadata();
+            //try-with-resources so the stream is released even if parsing fails
+            try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                    "/test-documents/" + fName)) {
+                p.parse(is, new BodyContentHandler(), m, new ParseContext());
+            }
+            //dc:format is multi-valued; the expected value only has to be one of them
+            boolean foundDC = false;
+            for (String v : m.getValues("dc:format")) {
+                if (v.equals(e.getValue())) {
+                    foundDC = true;
+                }
+            }
+            assertTrue("dc:format ::" + e.getValue(), foundDC);
+            String extensionVersionTruth = pdfExtensionVersions.get(e.getKey());
+            if (extensionVersionTruth != null) {
+                assertEquals("pdf:PDFExtensionVersion :: " + extensionVersionTruth,
+                        extensionVersionTruth,
+                        m.get("pdf:PDFExtensionVersion"));
+            }
+            assertEquals("pdf:PDFVersion", pdfVersions.get(e.getKey()),
+                    m.get("pdf:PDFVersion"));
+        }
+        //now test full 11.x
+        String fName = "testPDF_Version.11.x.PDFA-1b.pdf";
+        Metadata m = new Metadata();
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/" + fName)) {
+            p.parse(is, new BodyContentHandler(), m, new ParseContext());
+        }
+        Set<String> versions = new HashSet<String>();
+        for (String fmt : m.getValues("dc:format")) {
+            versions.add(fmt);
+        }
+
+        for (String hit : new String[]{"application/pdf; version=1.7",
+                "application/pdf; version=\"A-1b\"",
+                "application/pdf; version=\"1.7 Adobe Extension Level 8\""
+        }) {
+            assertTrue(hit, versions.contains(hit));
+        }
+
+        //JUnit argument order is (message, expected, actual)
+        assertEquals("pdfaid:conformance", "B", m.get("pdfaid:conformance"));
+        assertEquals("pdfaid:part", "1", m.get("pdfaid:part"));
+    }
+
+    /**
+     * Checks that both authors of testPDF_twoAuthors.pdf are reported under
+     * each of the author-related metadata keys.
+     */
+    @Test
+    public void testMultipleAuthors() throws Exception {
+        String fName = "testPDF_twoAuthors.pdf";
+        Parser p = new AutoDetectParser();
+        Metadata m = new Metadata();
+        //try-with-resources so the stream is released even if parsing fails
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/" + fName)) {
+            p.parse(is, new BodyContentHandler(), m, new ParseContext());
+        }
+
+        String[] keys = new String[]{
+                "dc:creator",
+                "meta:author",
+                "creator",
+                "Author"
+        };
+
+        for (String k : keys) {
+            String[] vals = m.getValues(k);
+            assertEquals("number of authors == 2 for key: " + k, 2, vals.length);
+            //order of the two values is not checked, only membership
+            Set<String> set = new HashSet<String>();
+            set.add(vals[0]);
+            set.add(vals[1]);
+            assertTrue("Sample Author 1", set.contains("Sample Author 1"));
+            assertTrue("Sample Author 2", set.contains("Sample Author 2"));
+        }
+    }
+
+    //STUB test for once TIKA-1295 is fixed
+    @Test
+    public void testMultipleTitles() throws Exception {
+        Parser p = new AutoDetectParser();
+        Metadata m = new Metadata();
+        //try-with-resources so the stream is released even if parsing fails
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDFTripleLangTitle.pdf")) {
+            p.parse(is, new BodyContentHandler(), m, new ParseContext());
+        }
+        //TODO: add other tests as part of TIKA-1295
+        //dc:title-fr-ca (or whatever we decide) should be "Bonjour World"
+        //dc:title-zh-ch is currently hosed...bug in PDFBox while injecting xmp?
+        //
+        assertEquals("Hello World", m.get("dc:title"));
+    }
+
+    /**
+     * With inline image extraction switched on, expects 2 inline resources
+     * and 2 attachments; after an {@link AvoidInlineSelector} is added to
+     * the context, expects 0 inline resources and still 2 attachments.
+     */
+    @Test
+    public void testInlineSelector() throws Exception {
+
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImages(true);
+        config.setExtractUniqueInlineImagesOnly(false);
+
+        Parser defaultParser = new AutoDetectParser();
+
+        RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+        ParseContext context = new ParseContext();
+        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
+        context.set(org.apache.tika.parser.Parser.class, p);
+        String path = "/test-documents/testPDF_childAttachments.pdf";
+
+        //try-with-resources so the stream is released even if parsing fails
+        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
+            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
+        }
+        int[] counts = countInlineAndAttachments(p.getMetadata());
+        assertEquals(2, counts[0]);
+        assertEquals(2, counts[1]);
+
+        p.reset();
+
+        //now try turning off inline
+        context.set(org.apache.tika.extractor.DocumentSelector.class, new AvoidInlineSelector());
+        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
+            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
+        }
+        counts = countInlineAndAttachments(p.getMetadata());
+        assertEquals(0, counts[0]);
+        assertEquals(2, counts[1]);
+
+    }
+
+
+    /**
+     * With no PDFParserConfig in the context, expects 0 inline resources and
+     * 2 attachments; after a config with inline image extraction switched on
+     * is added, expects 2 inline resources and 2 attachments.
+     */
+    @Test
+    public void testInlineConfig() throws Exception {
+
+        Parser defaultParser = new AutoDetectParser();
+        RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+        ParseContext context = new ParseContext();
+        context.set(org.apache.tika.parser.Parser.class, p);
+        String path = "/test-documents/testPDF_childAttachments.pdf";
+
+        //try-with-resources so the stream is released even if parsing fails
+        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
+            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
+        }
+        int[] counts = countInlineAndAttachments(p.getMetadata());
+        assertEquals(0, counts[0]);
+        assertEquals(2, counts[1]);
+
+        p.reset();
+
+        //now try turning on inline image extraction
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImages(true);
+        config.setExtractUniqueInlineImagesOnly(false);
+        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
+
+        try (InputStream stream = TikaInputStream.get(this.getClass().getResource(path))) {
+            p.parse(stream, new BodyContentHandler(-1), new Metadata(), context);
+        }
+        counts = countInlineAndAttachments(p.getMetadata());
+        assertEquals(2, counts[0]);
+        assertEquals(2, counts[1]);
+    }
+
+    /**
+     * Counts how many of the given metadata objects carry an embedded
+     * resource type of INLINE and how many carry ATTACHMENT.  Objects with
+     * no embedded resource type, or any other value, are not counted.
+     *
+     * @param metadatas metadata objects to inspect
+     * @return a two element array: {inline count, attachment count}
+     */
+    private static int[] countInlineAndAttachments(List<Metadata> metadatas) {
+        String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
+        String attachmentType = TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString();
+        int inline = 0;
+        int attach = 0;
+        for (Metadata m : metadatas) {
+            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            //constant-first equals also covers v == null
+            if (inlineType.equals(v)) {
+                inline++;
+            } else if (attachmentType.equals(v)) {
+                attach++;
+            }
+        }
+        return new int[]{inline, attach};
+    }
+
+    @Test //TIKA-1376
+    public void testEmbeddedFileNameExtraction() throws Exception {
+        RecursiveParserWrapper p = new RecursiveParserWrapper(
+                new AutoDetectParser(),
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        c.set(org.apache.tika.parser.Parser.class, p);
+        ContentHandler h = new BodyContentHandler();
+        //try-with-resources so the stream is released even if parsing fails
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF_multiFormatEmbFiles.pdf")) {
+            p.parse(is, h, m, c);
+        }
+        List<Metadata> metadatas = p.getMetadata();
+        assertEquals("metadata size", 5, metadatas.size());
+        //index 0 is skipped; the name of the first attachment is read from index 1
+        Metadata firstAttachment = metadatas.get(1);
+        assertEquals("attachment file name", "Test.txt",
+                firstAttachment.get(Metadata.RESOURCE_NAME_KEY));
+    }
+
+    @Test //TIKA-1374
+    public void testOSSpecificEmbeddedFileExtraction() throws Exception {
+        RecursiveParserWrapper p = new RecursiveParserWrapper(
+                new AutoDetectParser(),
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        c.set(org.apache.tika.parser.Parser.class, p);
+        ContentHandler h = new BodyContentHandler();
+        //try-with-resources so the stream is released even if parsing fails
+        try (InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF_multiFormatEmbFiles.pdf")) {
+            p.parse(is, h, m, c);
+        }
+        List<Metadata> metadatas = p.getMetadata();
+        assertEquals("metadata size", 5, metadatas.size());
+
+        //each embedded file is checked for its name and for a snippet of its text
+        assertEquals("file name", "Test.txt",
+                metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
+        assertContains("os specific",
+                metadatas.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("file name", "TestMac.txt",
+                metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY));
+        assertContains("mac embedded",
+                metadatas.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("file name", "TestDos.txt",
+                metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
+        assertContains("dos embedded",
+                metadatas.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("file name", "TestUnix.txt",
+                metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY));
+        assertContains("unix embedded",
+                metadatas.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+    }
+
+    @Test //TIKA-1427
+    public void testEmbeddedFileMarkup() throws Exception {
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+        pdfConfig.setExtractUniqueInlineImagesOnly(false);
+
+        Parser parser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        context.set(org.apache.tika.parser.Parser.class, parser);
+        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, pdfConfig);
+
+        ContentHandler xmlHandler = new ToXMLContentHandler();
+        Metadata metadata = new Metadata();
+        InputStream stream = null;
+        try {
+            stream = TikaInputStream.get(
+                    this.getClass().getResource("/test-documents/testPDF_childAttachments.pdf"));
+            parser.parse(stream, xmlHandler, metadata, context);
+        } finally {
+            IOUtils.closeQuietly(stream);
+        }
+
+        String attachmentXml = xmlHandler.toString();
+        //regular attachment
+        assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", attachmentXml);
+        //inline image
+        assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", attachmentXml);
+
+        //doc embedded inside an annotation
+        String annotationXml = getXML("testPDFFileEmbInAnnotation.pdf").xml;
+        assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", annotationXml);
+    }
+
+    //Access checker tests
+
+    @Test
+    public void testLegacyAccessChecking() throws Exception {
+        //test that default behavior doesn't throw AccessPermissionException
+        for (String file : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+            String xml = getXML(file).xml;
+            assertContains("Hello World", xml);
+        }
+
+        //now try with the user password
+        PasswordProvider provider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "user";
+            }
+        };
+
+        ParseContext context = new ParseContext();
+        context.set(PasswordProvider.class, provider);
+        Parser parser = new AutoDetectParser();
+
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+        }) {
+            InputStream stream = null;
+            try {
+                stream = 
TikaInputStream.get(this.getClass().getResource("/test-documents/" + path));
+                String text = getText(stream, parser, context);
+                assertContains("Hello World", text);
+            } finally {
+                IOUtils.closeQuietly(stream);
+            }
+        }
+    }
+
+    /**
+     * Access checking on documents that are opened without a password
+     * provider: AccessChecker(false) rejects both documents, while
+     * AccessChecker(true) rejects only the no_accessibility document and
+     * lets the yes_accessibility document through.
+     */
+    @Test
+    public void testAccessCheckingEmptyPassword() throws Exception {
+        final String noAccessibilityDoc =
+                "/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf";
+        final String yesAccessibilityDoc =
+                "/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf";
+
+        PDFParserConfig config = new PDFParserConfig();
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+        Parser parser = new AutoDetectParser();
+
+        //don't allow extraction, not even for accessibility
+        config.setAccessChecker(new AccessChecker(false));
+
+        //test exception for empty password
+        assertException(noAccessibilityDoc, parser, context, AccessPermissionException.class);
+        assertException(yesAccessibilityDoc, parser, context, AccessPermissionException.class);
+
+        //switch the checker's flag to true
+        config.setAccessChecker(new AccessChecker(true));
+        assertException(noAccessibilityDoc, parser, context, AccessPermissionException.class);
+
+        InputStream accessibleStream = null;
+        try {
+            accessibleStream = getResourceAsStream(yesAccessibilityDoc);
+            String text = getText(accessibleStream, parser, context);
+            assertContains("Hello World", text);
+        } finally {
+            IOUtils.closeQuietly(accessibleStream);
+        }
+    }
+
+    /**
+     * Access checking when the password provider answers "user": the
+     * owner_empty documents fail with EncryptedDocumentException under
+     * either checker setting, and the owner_user documents are subject to
+     * the access checker.
+     */
+    @Test
+    public void testAccessCheckingUserPassword() throws Exception {
+        final String[] ownerEmptyDocs = {
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        };
+        final String[] ownerUserDocs = {
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+        };
+
+        PDFParserConfig config = new PDFParserConfig();
+        //don't allow extraction, not even for accessibility
+        config.setAccessChecker(new AccessChecker(false));
+
+        ParseContext context = new ParseContext();
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "user";
+            }
+        });
+        context.set(PDFParserConfig.class, config);
+
+        Parser parser = new AutoDetectParser();
+
+        //test bad passwords
+        for (String docName : ownerEmptyDocs) {
+            assertException("/test-documents/" + docName, parser, context,
+                    EncryptedDocumentException.class);
+        }
+
+        //bad password is still a bad password
+        config.setAccessChecker(new AccessChecker(true));
+        for (String docName : ownerEmptyDocs) {
+            assertException("/test-documents/" + docName, parser, context,
+                    EncryptedDocumentException.class);
+        }
+
+        //now test documents that require this "user" password
+        assertException("/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf",
+                parser, context, AccessPermissionException.class);
+
+        InputStream accessibleStream = null;
+        try {
+            accessibleStream = getResourceAsStream(
+                    "/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf");
+            String text = getText(accessibleStream, parser, context);
+            assertContains("Hello World", text);
+        } finally {
+            IOUtils.closeQuietly(accessibleStream);
+        }
+
+        //with the checker's flag back to false both documents are rejected
+        config.setAccessChecker(new AccessChecker(false));
+        for (String docName : ownerUserDocs) {
+            assertException("/test-documents/" + docName, parser, context,
+                    AccessPermissionException.class);
+        }
+    }
+
+    @Test
+    public void testAccessCheckingOwnerPassword() throws Exception {
+        ParseContext context = new ParseContext();
+
+        PDFParserConfig config = new PDFParserConfig();
+        //don't allow extraction, not even for accessibility
+        config.setAccessChecker(new AccessChecker(true));
+        PasswordProvider passwordProvider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "owner";
+            }
+        };
+
+        context.set(PasswordProvider.class, passwordProvider);
+        context.set(PDFParserConfig.class, config);
+
+        Parser parser = new AutoDetectParser();
+        //with owner's password, text can be extracted, no matter the 
AccessibilityChecker's settings
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+
+            InputStream is = null;
+            try {
+                is = getResourceAsStream("/test-documents/" + 
"testPDF_no_extract_yes_accessibility_owner_user.pdf");
+                assertContains("Hello World", getText(is, parser, context));
+            } finally {
+                IOUtils.closeQuietly(is);
+            }
+        }
+
+        //really, with owner's password, all extraction is allowed
+        config.setAccessChecker(new AccessChecker(false));
+        for (String path : new String[]{
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+
+            InputStream is = null;
+            try {
+                is = getResourceAsStream("/test-documents/" + 
"testPDF_no_extract_yes_accessibility_owner_user.pdf");
+                assertContains("Hello World", getText(is, parser, context));
+            } finally {
+                IOUtils.closeQuietly(is);
+            }
+        }
+    }
+
+    /**
+     * Checks that the title metadata of testPDF_PDFEncodedStringInXMP.pdf
+     * comes back as "Microsoft".
+     */
+    @Test
+    public void testPDFEncodedStringsInXMP() throws Exception {
+        //TIKA-1678
+        Metadata extracted = getXML("testPDF_PDFEncodedStringInXMP.pdf").metadata;
+        String title = extracted.get(TikaCoreProperties.TITLE);
+        assertEquals("Microsoft", title);
+    }
+
+    /**
+     * Asserts that extracting text from the given resource throws an
+     * exception whose class is exactly {@code expected}.
+     *
+     * @param path     classpath location of the test document
+     * @param parser   parser to run
+     * @param context  parse context handed to the parser
+     * @param expected exact class of the exception that has to be thrown
+     */
+    private void assertException(String path, Parser parser, ParseContext context,
+                                 Class<?> expected) {
+        boolean threw = false;
+        InputStream is = getResourceAsStream(path);
+        try {
+            //the extracted text itself is of no interest, only whether this throws
+            getText(is, parser, context);
+        } catch (Exception e) {
+            threw = true;
+            //exact class match; a subclass of expected does not count
+            assertEquals("Not the right exception: " + path, expected, e.getClass());
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+        assertTrue(path + " should have thrown exception", threw);
+    }
+
+    /**
+     * Simple class to count end of document events.  If functionality is useful,
+     * move to org.apache.tika in src/test
+     * <p>
+     * Declared static because it does not use any state of the enclosing test.
+     */
+    private static class EventCountingHandler extends ContentHandlerDecorator {
+        //number of endDocument() calls seen so far
+        private int endDocument = 0;
+
+        /**
+         * Records the event; it is not passed on to the superclass.
+         */
+        @Override
+        public void endDocument() {
+            endDocument++;
+        }
+
+        /**
+         * @return how many times {@link #endDocument()} has been called
+         */
+        public int getEndDocument() {
+            return endDocument;
+        }
+    }
+
+    /**
+     * Document selector that rejects documents whose embedded resource type
+     * is INLINE and accepts everything else, including documents that carry
+     * no embedded resource type at all.
+     * <p>
+     * Declared static because it does not use any state of the enclosing test.
+     */
+    private static class AvoidInlineSelector implements DocumentSelector {
+
+        @Override
+        public boolean select(Metadata metadata) {
+            String v = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+            //constant-first equals also covers v == null
+            return !TikaCoreProperties.EmbeddedResourceType.INLINE.toString().equals(v);
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/pom.xml?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/pom.xml 
(added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/pom.xml 
Sat Jan 16 18:23:01 2016
@@ -0,0 +1,143 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more 
contributor 
+  license agreements. See the NOTICE file distributed with this work for 
additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-scientific-parser-module</artifactId>
+  <name>Apache Tika Scientific Parser Module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <netcdf-java.version>4.5.5</netcdf-java.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-exec</artifactId>
+      <version>1.3</version>
+    </dependency>
+    <dependency>
+      <groupId>com.googlecode.json-simple</groupId>
+      <artifactId>json-simple</artifactId>
+      <version>1.1.1</version>
+      <exclusions>
+        <exclusion>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.sis.core</groupId>
+      <artifactId>sis-utility</artifactId>
+      <version>0.5</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.sis.storage</groupId>
+      <artifactId>sis-netcdf</artifactId>
+      <version>0.5</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.sis.core</groupId>
+      <artifactId>sis-metadata</artifactId>
+      <version>0.5</version>
+    </dependency>
+    <!-- edu.ucar dependencies -->
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>netcdf4</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>grib</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>cdm</artifactId>
+      <version>${netcdf-java.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>jcl-over-slf4j</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>httpservices</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <!-- Apache cTAKES -->
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-core</artifactId>
+      <version>3.2.2</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <!-- Upstream parser libraries -->
+    <dependency>
+      <groupId>net.sourceforge.jmatio</groupId>
+      <artifactId>jmatio</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <!-- Apache Commons CSV -->
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-csv</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-text-parser-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+
+/**
+ * Enumerates the properties that an {@link IdentifiedAnnotation} object
+ * can provide.  Each constant carries the property name that
+ * {@link #getName()} returns.
+ */
+public enum CTAKESAnnotationProperty {
+    BEGIN("start"),
+    END("end"),
+    CONDITIONAL("conditional"),
+    CONFIDENCE("confidence"),
+    // NOTE: the constant name is misspelled ("TECNIQUE").  It is kept as is
+    // because renaming it would break code and configuration values that
+    // refer to the constant by name.
+    DISCOVERY_TECNIQUE("discoveryTechnique"),
+    GENERIC("generic"),
+    HISTORY_OF("historyOf"),
+    ID("id"),
+    ONTOLOGY_CONCEPT_ARR("ontologyConceptArr"),
+    POLARITY("polarity");
+
+    // Property name associated with the constant; never reassigned.
+    private final String name;
+
+    CTAKESAnnotationProperty(String name) {
+        this.name = name;
+    }
+
+    /**
+     * Returns the property name associated with this constant.
+     * @return the property name, e.g. {@code "start"} for {@link #BEGIN}.
+     */
+    public String getName() {
+        return name;
+    }
+}
\ No newline at end of file

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java?rev=1725014&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
 Sat Jan 16 18:23:01 2016
@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Serializable;
+import java.util.Properties;
+
+import static org.apache.commons.io.output.NullOutputStream.NULL_OUTPUT_STREAM;
+
+/**
+ * Configuration for {@link CTAKESContentHandler}.
+ * <p>
+ * This class allows enabling cTAKES and setting its parameters.  Some of
+ * the values can be loaded from a properties stream; keys that are absent
+ * from the stream keep the defaults declared on the fields below.
+ */
+public class CTAKESConfig implements Serializable {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -1599741171775528923L;
+
+    // Path to XML descriptor for AnalysisEngine
+    private String aeDescriptorPath =
+        "/ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml";
+
+    // UMLS username
+    private String UMLSUser = "";
+
+    // UMLS password
+    private String UMLSPass = "";
+
+    // Enables formatted output
+    private boolean prettyPrint = true;
+
+    // Type of cTAKES (UIMA) serializer
+    private CTAKESSerializer serializerType = CTAKESSerializer.XMI;
+
+    // OutputStream object used for CAS serialization.  Transient because
+    // output streams (including the default no-op stream) are not
+    // Serializable; readObject() restores the default.
+    private transient OutputStream stream = NULL_OUTPUT_STREAM;
+
+    // Enables CAS serialization
+    private boolean serialize = false;
+
+    // Enables text analysis using cTAKES
+    private boolean text = true;
+
+    // List of metadata to analyze using cTAKES
+    private String[] metadata = null;
+
+    // List of annotation properties to add to metadata in addition to
+    // text covered by an annotation
+    private CTAKESAnnotationProperty[] annotationProps = null;
+
+    // Character used to separate the annotation properties into metadata
+    private char separatorChar = ':';
+
+    /**
+     * Default constructor.  Loads the settings from the
+     * {@code CTAKESConfig.properties} class path resource when it exists.
+     */
+    public CTAKESConfig() {
+        init(this.getClass().getResourceAsStream("CTAKESConfig.properties"));
+    }
+
+    /**
+     * Loads properties from InputStream and then tries to close InputStream.
+     * @param stream {@link InputStream} object used to read properties;
+     *        when {@code null} the defaults are kept.
+     */
+    public CTAKESConfig(InputStream stream) {
+        init(stream);
+    }
+
+    /**
+     * Reads the supported keys ({@code aeDescriptorPath}, {@code UMLSUser},
+     * {@code UMLSPass}, {@code text}, {@code metadata},
+     * {@code annotationProps}, {@code separatorChar}) from the stream and
+     * closes it.  A key that is missing leaves the current value untouched.
+     *
+     * @param stream properties source, may be {@code null}.
+     * @throws IllegalArgumentException if {@code annotationProps} contains
+     *         a name that is not a {@link CTAKESAnnotationProperty}
+     *         constant.
+     */
+    private void init(InputStream stream) {
+        if (stream == null) {
+            return;
+        }
+        Properties props = new Properties();
+
+        try {
+            props.load(stream);
+        } catch (IOException e) {
+            // TODO warning
+            // Best effort: continue with whatever was read so far.
+        } finally {
+            try {
+                stream.close();
+            } catch (IOException ioe) {
+                // TODO warning
+            }
+        }
+
+        setAeDescriptorPath(
+                props.getProperty("aeDescriptorPath", getAeDescriptorPath()));
+        setUMLSUser(props.getProperty("UMLSUser", getUMLSUser()));
+        setUMLSPass(props.getProperty("UMLSPass", getUMLSPass()));
+        setText(Boolean.parseBoolean(
+                props.getProperty("text", Boolean.toString(isText()))));
+
+        // The list-valued keys are applied only when present.  Falling
+        // back to the *AsString() getters would turn "no metadata" into an
+        // array holding one empty string, and would hand "coveredText" to
+        // CTAKESAnnotationProperty.valueOf(), which rejects it.
+        String metadataValue = props.getProperty("metadata");
+        if (metadataValue != null) {
+            setMetadata(splitList(metadataValue));
+        }
+        String annotationValue = props.getProperty("annotationProps");
+        if (annotationValue != null) {
+            setAnnotationProps(splitList(annotationValue));
+        }
+
+        // An empty value has no first character; keep the current one.
+        String separatorValue = props.getProperty("separatorChar");
+        if (separatorValue != null && !separatorValue.isEmpty()) {
+            setSeparatorChar(separatorValue.charAt(0));
+        }
+    }
+
+    /**
+     * Splits a comma-separated value into its trimmed, non-empty items.
+     * @param value the raw property value, not {@code null}.
+     * @return the items in order; an empty array if there are none.
+     */
+    private static String[] splitList(String value) {
+        // Fully qualified so that no additional import is required.
+        java.util.List<String> items = new java.util.ArrayList<String>();
+        for (String item : value.split(",")) {
+            String trimmed = item.trim();
+            if (!trimmed.isEmpty()) {
+                items.add(trimmed);
+            }
+        }
+        return items.toArray(new String[items.size()]);
+    }
+
+    /**
+     * Restores the transient output stream to the default no-op stream
+     * after the serializable fields have been read.
+     */
+    private void readObject(java.io.ObjectInputStream in)
+            throws IOException, ClassNotFoundException {
+        in.defaultReadObject();
+        this.stream = NULL_OUTPUT_STREAM;
+    }
+
+    /**
+     * Returns the path to XML descriptor for AnalysisEngine.
+     * @return the path to XML descriptor for AnalysisEngine.
+     */
+    public String getAeDescriptorPath() {
+        return aeDescriptorPath;
+    }
+
+    /**
+     * Returns the UMLS username.
+     * @return the UMLS username.
+     */
+    public String getUMLSUser() {
+        return UMLSUser;
+    }
+
+    /**
+     * Returns the UMLS password.
+     * @return the UMLS password.
+     */
+    public String getUMLSPass() {
+        return UMLSPass;
+    }
+
+    /**
+     * Returns {@code true} if formatted output is enabled, {@code false}
+     * otherwise.
+     * @return {@code true} if formatted output is enabled, {@code false}
+     *         otherwise.
+     */
+    public boolean isPrettyPrint() {
+        return prettyPrint;
+    }
+
+    /**
+     * Returns the type of cTAKES (UIMA) serializer used to write the CAS.
+     * @return the type of cTAKES serializer.
+     */
+    public CTAKESSerializer getSerializerType() {
+        return serializerType;
+    }
+
+    /**
+     * Returns the {@link OutputStream} object used to write the CAS.
+     * @return the {@link OutputStream} object used to write the CAS.
+     */
+    public OutputStream getOutputStream() {
+        return stream;
+    }
+
+    /**
+     * Returns {@code true} if CAS serialization is enabled, {@code false}
+     * otherwise.
+     * @return {@code true} if CAS serialization is enabled, {@code false}
+     *         otherwise.
+     */
+    public boolean isSerialize() {
+        return serialize;
+    }
+
+    /**
+     * Returns {@code true} if content text analysis is enabled,
+     * {@code false} otherwise.
+     * @return {@code true} if content text analysis is enabled,
+     *         {@code false} otherwise.
+     */
+    public boolean isText() {
+        return text;
+    }
+
+    /**
+     * Returns an array of metadata whose values will be analyzed using
+     * cTAKES.
+     * @return the metadata names, or {@code null} if none have been set.
+     */
+    public String[] getMetadata() {
+        return metadata;
+    }
+
+    /**
+     * Returns a string containing a comma-separated list of metadata whose
+     * values will be analyzed using cTAKES.
+     * @return the comma-separated metadata names; an empty string if no
+     *         metadata have been set.
+     */
+    public String getMetadataAsString() {
+        if (metadata == null) {
+            return "";
+        }
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < metadata.length; i++) {
+            sb.append(metadata[i]);
+            if (i < metadata.length - 1) {
+                sb.append(",");
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns an array of {@link CTAKESAnnotationProperty}'s that will be
+     * included into cTAKES metadata.
+     * @return the annotation properties, or {@code null} if none have been
+     *         set.
+     */
+    public CTAKESAnnotationProperty[] getAnnotationProps() {
+        return annotationProps;
+    }
+
+    /**
+     * Returns a string that starts with {@code coveredText} and is followed
+     * by the name of every configured {@link CTAKESAnnotationProperty},
+     * each preceded by the separator character.
+     * @return {@code coveredText} plus the separator-prefixed property
+     *         names; just {@code coveredText} if no properties are set.
+     */
+    public String getAnnotationPropsAsString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("coveredText");
+        if (annotationProps != null) {
+            for (CTAKESAnnotationProperty property : annotationProps) {
+                sb.append(separatorChar);
+                sb.append(property.getName());
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns the separator character used for annotation properties.
+     * @return the separator character used for annotation properties.
+     */
+    public char getSeparatorChar() {
+        return separatorChar;
+    }
+
+    /**
+     * Sets the path to XML descriptor for AnalysisEngine.
+     * @param aeDescriptorPath the path to XML descriptor for AnalysisEngine.
+     */
+    public void setAeDescriptorPath(String aeDescriptorPath) {
+        this.aeDescriptorPath = aeDescriptorPath;
+    }
+
+    /**
+     * Sets the UMLS username.
+     * @param uMLSUser the UMLS username.
+     */
+    public void setUMLSUser(String uMLSUser) {
+        this.UMLSUser = uMLSUser;
+    }
+
+    /**
+     * Sets the UMLS password.
+     * @param uMLSPass the UMLS password.
+     */
+    public void setUMLSPass(String uMLSPass) {
+        this.UMLSPass = uMLSPass;
+    }
+
+    /**
+     * Enables the formatted output for serializer.
+     * @param prettyPrint {@code true} to enable formatted output,
+     *        {@code false} otherwise.
+     */
+    public void setPrettyPrint(boolean prettyPrint) {
+        this.prettyPrint = prettyPrint;
+    }
+
+    /**
+     * Sets the type of cTAKES (UIMA) serializer used to write CAS.
+     * @param serializerType the type of cTAKES serializer.
+     */
+    public void setSerializerType(CTAKESSerializer serializerType) {
+        this.serializerType = serializerType;
+    }
+
+    /**
+     * Sets the {@link OutputStream} object used to write the CAS.  The
+     * stream is not part of the serialized form of this configuration.
+     * @param stream the {@link OutputStream} object used to write the CAS.
+     */
+    public void setOutputStream(OutputStream stream) {
+        this.stream = stream;
+    }
+
+    /**
+     * Enables CAS serialization.
+     * @param serialize {@code true} to enable CAS serialization,
+     *        {@code false} otherwise.
+     */
+    public void setSerialize(boolean serialize) {
+        this.serialize = serialize;
+    }
+
+    /**
+     * Enables content text analysis using cTAKES.
+     * @param text {@code true} to enable content text analysis,
+     *        {@code false} otherwise.
+     */
+    public void setText(boolean text) {
+        this.text = text;
+    }
+
+    /**
+     * Sets the metadata whose values will be analyzed using cTAKES.
+     * @param metadata the metadata whose values will be analyzed using
+     *        cTAKES.
+     */
+    public void setMetadata(String[] metadata) {
+        this.metadata = metadata;
+    }
+
+    /**
+     * Sets the {@link CTAKESAnnotationProperty}'s that will be included
+     * into cTAKES metadata.
+     * @param annotationProps the {@link CTAKESAnnotationProperty}'s that
+     *        will be included into cTAKES metadata.
+     */
+    public void setAnnotationProps(CTAKESAnnotationProperty[] annotationProps) {
+        this.annotationProps = annotationProps;
+    }
+
+    /**
+     * Sets the {@link CTAKESAnnotationProperty}'s that will be included
+     * into cTAKES metadata, given as enum constant names (for example
+     * {@code "BEGIN"}).
+     * @param annotationProps the constant names of the
+     *        {@link CTAKESAnnotationProperty}'s to include.
+     * @throws IllegalArgumentException if a name does not match a
+     *         {@link CTAKESAnnotationProperty} constant exactly.
+     */
+    public void setAnnotationProps(String[] annotationProps) {
+        CTAKESAnnotationProperty[] properties =
+                new CTAKESAnnotationProperty[annotationProps.length];
+        for (int i = 0; i < annotationProps.length; i++) {
+            properties[i] =
+                    CTAKESAnnotationProperty.valueOf(annotationProps[i]);
+        }
+        setAnnotationProps(properties);
+    }
+
+    /**
+     * Sets the separator character used for annotation properties.
+     * @param separatorChar the separator character used for annotation
+     *        properties.
+     */
+    public void setSeparatorChar(char separatorChar) {
+        this.separatorChar = separatorChar;
+    }
+}
\ No newline at end of file

svn commit: r1725014 [21/28] - in /tika/branches/2.x: tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-parser-module/ tika-parser-modules/tika-advanced-parser-m...

Reply via email to