Added: tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PropertyTypeException;
+import org.junit.Test;
+
+/**
+ * Tests for {@link AccessChecker} against metadata carrying the
+ * {@link AccessPermissions#EXTRACT_CONTENT} and
+ * {@link AccessPermissions#EXTRACT_FOR_ACCESSIBILITY} flags.
+ */
+public class AccessCheckerTest {
+
+    /**
+     * A checker built with the no-arg constructor is expected to accept every
+     * combination tried here; any exception propagates and fails the test.
+     */
+    @Test
+    public void testLegacy() throws AccessPermissionException {
+        //legacy behavior; don't bother checking
+        AccessChecker checker = new AccessChecker();
+
+        checker.check(getMetadata(false, false));
+        checker.check(getMetadata(false, true));
+        checker.check(getMetadata(true, true));
+    }
+
+    /**
+     * A checker built with {@code false} is expected to reject a document that
+     * forbids content extraction, whatever its accessibility flag says.
+     */
+    @Test
+    public void testNoExtraction() {
+        //allow nothing
+        AccessChecker checker = new AccessChecker(false);
+
+        assertTrue("correct exception with no extraction, no extract for accessibility",
+                isDenied(checker, getMetadata(false, false)));
+
+        //document allows extraction for accessibility,
+        //but application is not an accessibility application
+        assertTrue("correct exception with no extraction, extract for accessibility allowed by document",
+                isDenied(checker, getMetadata(false, true)));
+    }
+
+    /**
+     * A checker built with {@code true} is expected to accept a document that
+     * permits extraction for accessibility only, and reject one that permits nothing.
+     */
+    @Test
+    public void testExtractOnlyForAccessibility() throws AccessPermissionException {
+        //allow accessibility
+        AccessChecker checker = new AccessChecker(true);
+
+        checker.check(getMetadata(false, true));
+
+        assertTrue("correct exception", isDenied(checker, getMetadata(false, false)));
+    }
+
+    /**
+     * The odd combination: extraction allowed, extraction for accessibility
+     * not allowed. Neither checker configuration is expected to reject it.
+     */
+    @Test
+    public void testCrazyExtractNotForAccessibility() throws AccessPermissionException {
+        Metadata m = getMetadata(true, false);
+
+        //allow accessibility
+        new AccessChecker(true).check(m);
+
+        //don't extract for accessibility; if extract content is allowed,
+        //the checker shouldn't check the value of extract for accessibility
+        new AccessChecker(false).check(m);
+    }
+
+    /**
+     * Adding a second value to either permission property is expected to
+     * raise {@link PropertyTypeException}.
+     */
+    @Test
+    public void testCantAddMultiplesToMetadata() {
+        Metadata m = new Metadata();
+        boolean ex = false;
+        m.add(AccessPermissions.EXTRACT_CONTENT, "true");
+        try {
+            m.add(AccessPermissions.EXTRACT_CONTENT, "false");
+        } catch (PropertyTypeException e) {
+            ex = true;
+        }
+        assertTrue("can't add multiple values", ex);
+
+        m = new Metadata();
+        ex = false;
+        m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true");
+        try {
+            m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false");
+        } catch (PropertyTypeException e) {
+            ex = true;
+        }
+        assertTrue("can't add multiple values", ex);
+    }
+
+    /**
+     * Runs the check and reports whether it was refused.
+     *
+     * @return {@code true} if {@code checker.check} threw
+     *         {@link AccessPermissionException}, {@code false} if it returned normally
+     */
+    private static boolean isDenied(AccessChecker checker, Metadata metadata) {
+        try {
+            checker.check(metadata);
+            return false;
+        } catch (AccessPermissionException expected) {
+            return true;
+        }
+    }
+
+    /**
+     * Builds metadata with both permission flags set explicitly.
+     *
+     * @param allowExtraction value stored under EXTRACT_CONTENT
+     * @param allowExtractionForAccessibility value stored under EXTRACT_FOR_ACCESSIBILITY
+     */
+    private Metadata getMetadata(boolean allowExtraction, boolean allowExtractionForAccessibility) {
+        Metadata m = new Metadata();
+        m.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(allowExtraction));
+        m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(allowExtractionForAccessibility));
+        return m;
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,1377 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pdf; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.io.IOUtils; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.tika.TikaTest; +import org.apache.tika.exception.AccessPermissionException; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.ContainerExtractor; +import org.apache.tika.extractor.DocumentSelector; +import org.apache.tika.extractor.ParserContainerExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ContentHandlerDecorator; +import org.apache.tika.sax.ToXMLContentHandler; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Test case for parsing pdf files. 
+ */ +public class PDFParserTest extends TikaTest { + + public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN; + public static final MediaType TYPE_EMF = MediaType.application("x-emf"); + public static final MediaType TYPE_PDF = MediaType.application("pdf"); + public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"); + public static final MediaType TYPE_DOC = MediaType.application("msword"); + public static Level PDFBOX_LOG_LEVEL = Level.INFO; + + @BeforeClass + public static void setup() { + //remember default logging level, but turn off for PDFParserTest + PDFBOX_LOG_LEVEL = Logger.getLogger("org.apache.pdfbox").getLevel(); + Logger.getLogger("org.apache.pdfbox").setLevel(Level.OFF); + } + + @AfterClass + public static void tearDown() { + //return to regular logging level + Logger.getLogger("org.apache.pdfbox").setLevel(PDFBOX_LOG_LEVEL); + } + + private static int substringCount(String needle, String haystack) { + int upto = -1; + int count = 0; + while (true) { + final int next = haystack.indexOf(needle, upto); + if (next == -1) { + break; + } + count++; + upto = next + 1; + } + + return count; + } + + @Test + public void testPdfParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ Metadata metadata = new Metadata(); + + InputStream stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF.pdf"); + + String content = getText(stream, parser, metadata); + + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Bertrand Delacr\u00e9taz", metadata.get(Metadata.AUTHOR)); + assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL)); + assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE)); + + // Can't reliably test dates yet - see TIKA-451 +// assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.CREATION_DATE)); +// assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.LAST_MODIFIED)); + + assertContains("Apache Tika", content); + assertContains("Tika - Content Analysis Toolkit", content); + assertContains("incubator", content); + assertContains("Apache Software Foundation", content); + // testing how the end of one paragraph is separated from start of the next one + assertTrue("should have word boundary after headline", + !content.contains("ToolkitApache")); + assertTrue("should have word boundary between paragraphs", + !content.contains("libraries.Apache")); + } + + @Test + public void testPdfParsingMetadataOnly() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ Metadata metadata = new Metadata(); + + try (InputStream stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF.pdf")) { + parser.parse(stream, null, metadata, new ParseContext()); + } + + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL)); + assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE)); + } + + @Test + public void testCustomMetadata() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + Metadata metadata = new Metadata(); + + InputStream stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF-custommetadata.pdf"); + + String content = getText(stream, parser, metadata); + + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Document author", metadata.get(Metadata.AUTHOR)); + assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE)); + + assertEquals("Custom Value", metadata.get("Custom Property")); + + assertEquals("Array Entry 1", metadata.get("Custom Array")); + assertEquals(2, metadata.getValues("Custom Array").length); + assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]); + assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]); + + assertContains("Hello World!", content); + } + + /** + * PDFs can be "protected" with the default password. This means + * they're encrypted (potentially both text and metadata), + * but we can decrypt them easily. + */ + @Test + public void testProtectedPDF() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + + try (InputStream stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF_protected.pdf")) { + parser.parse(stream, handler, metadata, context); + } + + assertEquals("true", metadata.get("pdf:encrypted")); + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR)); + assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT)); + assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE)); + + String content = handler.toString(); + assertContains("RETHINKING THE FINANCIAL NETWORK", content); + assertContains("On 16 November 2002", content); + assertContains("In many important respects", content); + + + // Try again with an explicit empty password + handler = new BodyContentHandler(); + metadata = new Metadata(); + + context = new ParseContext(); + context.set(PasswordProvider.class, new PasswordProvider() { + public String getPassword(Metadata metadata) { + return ""; + } + }); + + try (InputStream stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF_protected.pdf")) { + parser.parse(stream, handler, metadata, context); + } + assertEquals("true", metadata.get("pdf:encrypted")); + + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Speeches by Andrew 
G Haldane", metadata.get(Metadata.SUBJECT)); + assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE)); + + assertContains("RETHINKING THE FINANCIAL NETWORK", content); + assertContains("On 16 November 2002", content); + assertContains("In many important respects", content); + + //now test wrong password + handler = new BodyContentHandler(); + metadata = new Metadata(); + context = new ParseContext(); + context.set(PasswordProvider.class, new PasswordProvider() { + public String getPassword(Metadata metadata) { + return "WRONG!!!!"; + } + }); + + boolean ex = false; + try (InputStream stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF_protected.pdf")) { + parser.parse(stream, handler, metadata, context); + } catch (EncryptedDocumentException e) { + ex = true; + } + content = handler.toString(); + + assertTrue("encryption exception", ex); + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("true", metadata.get("pdf:encrypted")); + //pdf:encrypted, X-Parsed-By and Content-Type + assertEquals("very little metadata should be parsed", 3, metadata.names().length); + assertEquals(0, content.length()); + + //now test wrong password with non sequential parser + handler = new BodyContentHandler(); + metadata = new Metadata(); + context = new ParseContext(); + context.set(PasswordProvider.class, new PasswordProvider() { + public String getPassword(Metadata metadata) { + return "WRONG!!!!"; + } + }); + PDFParserConfig config = new PDFParserConfig(); + config.setUseNonSequentialParser(true); + context.set(PDFParserConfig.class, config); + + ; + ex = false; + try (InputStream stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF_protected.pdf")) { + parser.parse(stream, handler, metadata, context); + } catch 
(EncryptedDocumentException e) { + ex = true; + } + content = handler.toString(); + assertTrue("encryption exception", ex); + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("true", metadata.get("pdf:encrypted")); + + //pdf:encrypted, X-Parsed-By and Content-Type + assertEquals("very little metadata should be parsed", 3, metadata.names().length); + assertEquals(0, content.length()); + } + + @Test + public void testTwoTextBoxes() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + InputStream stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDFTwoTextBoxes.pdf"); + String content = getText(stream, parser); + content = content.replaceAll("\\s+", " "); + assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content); + } + + @Test + public void testVarious() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + Metadata metadata = new Metadata(); + InputStream stream = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDFVarious.pdf"); + + String content = getText(stream, parser, metadata); + //content = content.replaceAll("\\s+"," "); + assertContains("Footnote appears here", content); + assertContains("This is a footnote.", content); + assertContains("This is the header text.", content); + assertContains("This is the footer text.", content); + assertContains("Here is a text box", content); + assertContains("Bold", content); + assertContains("italic", content); + assertContains("underline", content); + assertContains("superscript", content); + assertContains("subscript", content); + assertContains("Here is a citation:", content); + assertContains("Figure 1 This is a caption for Figure 1", content); + assertContains("(Kramer)", content); + assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," ")); + assertContains("Row 1 
column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," ")); + assertContains("This is a hyperlink", content); + assertContains("Here is a list:", content); + for(int row=1;row<=3;row++) { + //assertContains("·\tBullet " + row, content); + //assertContains("\u00b7\tBullet " + row, content); + assertContains("Bullet " + row, content); + } + assertContains("Here is a numbered list:", content); + for(int row=1;row<=3;row++) { + //assertContains(row + ")\tNumber bullet " + row, content); + assertContains(row + ") Number bullet " + row, content); + } + + for(int row=1;row<=2;row++) { + for(int col=1;col<=3;col++) { + assertContains("Row " + row + " Col " + col, content); + } + } + + assertContains("Keyword1 Keyword2", content); + assertEquals("Keyword1 Keyword2", + metadata.get(Metadata.KEYWORDS)); + + assertContains("Subject is here", content); + assertEquals("Subject is here", + metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Subject is here", + metadata.get(Metadata.SUBJECT)); + + assertContains("Suddenly some Japanese text:", content); + // Special version of (GHQ) + assertContains("\uff08\uff27\uff28\uff31\uff09", content); + // 6 other characters + assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content); + + assertContains("And then some Gothic text:", content); + // TODO: I saved the word doc as a PDF, but that + // process somehow, apparently lost the gothic + // chars, so we cannot test this here: + //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content); + } + + @Test + public void testAnnotations() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf"); + String content = getText(stream, parser); + content = content.replaceAll("[\\s\u00a0]+", " "); + assertContains("Here is some text", content); + assertContains("Here is a comment", content); + + // Test w/ annotation text disabled: + PDFParser pdfParser = new PDFParser(); + pdfParser.getPDFParserConfig().setExtractAnnotationText(false); + stream = getResourceAsStream("/test-documents/testAnnotations.pdf"); + content = getText(stream, pdfParser); + content = content.replaceAll("[\\s\u00a0]+", " "); + assertContains("Here is some text", content); + assertEquals(-1, content.indexOf("Here is a comment")); + + // annotation text disabled through parsecontext + ParseContext context = new ParseContext(); + PDFParserConfig config = new PDFParserConfig(); + config.setExtractAnnotationText(false); + context.set(PDFParserConfig.class, config); + stream = getResourceAsStream("/test-documents/testAnnotations.pdf"); + content = getText(stream, parser, context); + content = content.replaceAll("[\\s\u00a0]+", " "); + assertContains("Here is some text", content); + assertEquals(-1, content.indexOf("Here is a comment")); + + + // TIKA-738: make sure no extra </p> tags + String xml = getXML("testAnnotations.pdf").xml; + assertEquals(substringCount("<p>", xml), + substringCount("</p>", xml)); + } + + // TIKA-981 + @Test + public void testPopupAnnotation() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ InputStream stream = getResourceAsStream("/test-documents/testPopupAnnotation.pdf"); + String content = getText(stream, parser); + assertContains("this is the note", content); + assertContains("igalsh", content); + } + + @Test + public void testEmbeddedPDFs() throws Exception { + String xml = getXML("testPDFPackage.pdf").xml; + assertContains("PDF1", xml); + assertContains("PDF2", xml); + } + + @Test + public void testPageNumber() throws Exception { + final XMLResult result = getXML("testPageNumber.pdf"); + final String content = result.xml.replaceAll("\\s+", ""); + assertContains("<p>1</p>", content); + } + + /** + * Test to ensure that Links are extracted from the text + * <p/> + * Note - the PDF contains the text "This is a hyperlink" which + * a hyperlink annotation, linking to the tika site, on it. This + * test will need updating when we're able to apply the annotation + * to the text itself, rather than following on afterwards as now + */ + @Test + public void testLinks() throws Exception { + final XMLResult result = getXML("testPDFVarious.pdf"); + assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml); + } + + @Test + public void testDisableAutoSpace() throws Exception { + PDFParser parser = new PDFParser(); + parser.getPDFParserConfig().setEnableAutoSpace(false); + InputStream stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf"); + String content = getText(stream, parser); + content = content.replaceAll("[\\s\u00a0]+", " "); + // Text is correct when autoSpace is off: + assertContains("Here is some formatted text", content); + + parser.getPDFParserConfig().setEnableAutoSpace(true); + stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf"); + content = getText(stream, parser); + content = content.replaceAll("[\\s\u00a0]+", " "); + // Text is correct when autoSpace is off: + + // Text has extra spaces when autoSpace is on + assertEquals(-1, content.indexOf("Here is some formatted 
text")); + + //now try with autodetect + Parser autoParser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + PDFParserConfig config = new PDFParserConfig(); + context.set(PDFParserConfig.class, config); + //default is true + stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf"); + content = getText(stream, autoParser, context); + content = content.replaceAll("[\\s\u00a0]+", " "); + // Text has extra spaces when autoSpace is on + assertEquals(-1, content.indexOf("Here is some formatted text")); + + config.setEnableAutoSpace(false); + + stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf"); + content = getText(stream, parser, context); + content = content.replaceAll("[\\s\u00a0]+", " "); + // Text is correct when autoSpace is off: + assertContains("Here is some formatted text", content); + + } + + @Test + public void testDuplicateOverlappingText() throws Exception { + PDFParser parser = new PDFParser(); + InputStream stream = getResourceAsStream("/test-documents/testOverlappingText.pdf"); + // Default is false (keep overlapping text): + String content = getText(stream, parser); + assertContains("Text the first timeText the second time", content); + + parser.getPDFParserConfig().setSuppressDuplicateOverlappingText(true); + stream = getResourceAsStream("/test-documents/testOverlappingText.pdf"); + content = getText(stream, parser); + // "Text the first" was dedup'd: + assertContains("Text the first timesecond time", content); + + //now try with autodetect + Parser autoParser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + PDFParserConfig config = new PDFParserConfig(); + context.set(PDFParserConfig.class, config); + stream = getResourceAsStream("/test-documents/testOverlappingText.pdf"); + // Default is false (keep overlapping text): + content = getText(stream, autoParser, context); + assertContains("Text the first timeText the second time", content); + + 
config.setSuppressDuplicateOverlappingText(true); + stream = getResourceAsStream("/test-documents/testOverlappingText.pdf"); + content = getText(stream, autoParser, context); + // "Text the first" was dedup'd: + assertContains("Text the first timesecond time", content); + + } + + @Test + public void testSortByPosition() throws Exception { + PDFParser parser = new PDFParser(); + parser.getPDFParserConfig().setEnableAutoSpace(false); + InputStream stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"); + // Default is false (do not sort): + String content = getText(stream, parser); + content = content.replaceAll("\\s+", " "); + assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content); + + parser.getPDFParserConfig().setSortByPosition(true); + stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"); + content = getText(stream, parser); + content = content.replaceAll("\\s+", " "); + // Column text is now interleaved: + assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content); + + //now try setting autodetect via parsecontext + AutoDetectParser autoParser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + PDFParserConfig config = new PDFParserConfig(); + context.set(PDFParserConfig.class, config); + stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"); + // Default is false (do not sort): + content = getText(stream, autoParser, context); + content = content.replaceAll("\\s+", " "); + assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content); + + config.setSortByPosition(true); + context.set(PDFParserConfig.class, config); + stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"); + content = getText(stream, parser); + content = content.replaceAll("\\s+", " "); + // Column text is now interleaved: + assertContains("Left column line 1 Right 
column line 1 Left colu mn line 2 Right column line 2", content); + + } + + // TIKA-1035 + @Test + public void testBookmarks() throws Exception { + String xml = getXML("testPDF_bookmarks.pdf").xml; + int i = xml.indexOf("Denmark bookmark is here"); + int j = xml.indexOf("</body>"); + assertTrue(i != -1); + assertTrue(j != -1); + assertTrue(i < j); + } + + //TIKA-1124 + @Test + public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception { + /* format of test doc: + docx/ + pdf/ + docx + */ + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + String content = ""; + InputStream stream = null; + try { + context.set(org.apache.tika.parser.Parser.class, parser); + stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"); + parser.parse(stream, handler, metadata, context); + content = handler.toString(); + } finally { + stream.close(); + } + int outerHaystack = content.indexOf("Outer_haystack"); + int pdfHaystack = content.indexOf("pdf_haystack"); + int needle = content.indexOf("Needle"); + assertTrue(outerHaystack > -1); + assertTrue(pdfHaystack > -1); + assertTrue(needle > -1); + assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack); + + TrackingHandler tracker = new TrackingHandler(); + TikaInputStream tis; + ContainerExtractor ex = new ParserContainerExtractor(); + try { + tis = TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx")); + ex.extract(tis, ex, tracker); + } finally { + stream.close(); + } + assertEquals(true, ex.isSupported(tis)); + assertEquals(3, tracker.filenames.size()); + assertEquals(3, tracker.mediaTypes.size()); + assertEquals("image1.emf", tracker.filenames.get(0)); + assertNull(tracker.filenames.get(1)); + assertEquals("Test.docx", tracker.filenames.get(2)); + assertEquals(TYPE_EMF, tracker.mediaTypes.get(0)); + 
assertEquals(TYPE_PDF, tracker.mediaTypes.get(1)); + assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2)); + } + + /** + * tests for equality between traditional sequential parser + * and newer nonsequential parser. + * <p/> + * TODO: more testing + */ + @Test + public void testSequentialParser() throws Exception { + + Parser sequentialParser = new AutoDetectParser(); + Parser nonSequentialParser = new AutoDetectParser(); + + ParseContext seqContext = new ParseContext(); + PDFParserConfig seqConfig = new PDFParserConfig(); + seqConfig.setUseNonSequentialParser(false); + seqContext.set(PDFParserConfig.class, seqConfig); + + ParseContext nonSeqContext = new ParseContext(); + PDFParserConfig nonSeqConfig = new PDFParserConfig(); + nonSeqConfig.setUseNonSequentialParser(true); + nonSeqContext.set(PDFParserConfig.class, nonSeqConfig); + + File testDocs = new File(this.getClass().getResource("/test-documents").toURI()); + int pdfs = 0; + Set<String> knownMetadataDiffs = new HashSet<String>(); + //PDFBox-1792/Tika-1203 + knownMetadataDiffs.add("testAnnotations.pdf"); + // Added for TIKA-93. 
+ knownMetadataDiffs.add("testOCR.pdf"); + // Added for TIKA-1085 + knownMetadataDiffs.add("testPDF_bom.pdf"); + + //empty for now + Set<String> knownContentDiffs = new HashSet<String>(); + + for (File f : testDocs.listFiles()) { + if (!f.getName().toLowerCase(Locale.ROOT).endsWith(".pdf")) { + continue; + } + + String sequentialContent = null; + Metadata sequentialMetadata = new Metadata(); + try { + sequentialContent = getText(new FileInputStream(f), + sequentialParser, seqContext, sequentialMetadata); + } catch (EncryptedDocumentException e) { + //silently skip a file that requires a user password + continue; + } catch (Exception e) { + throw new TikaException("Sequential Parser failed on test file " + f, e); + } + + pdfs++; + + String nonSequentialContent = null; + Metadata nonSequentialMetadata = new Metadata(); + try { + nonSequentialContent = getText(new FileInputStream(f), + nonSequentialParser, nonSeqContext, nonSequentialMetadata); + } catch (Exception e) { + throw new TikaException("Non-Sequential Parser failed on test file " + f, e); + } + + if (knownContentDiffs.contains(f.getName())) { + assertFalse(f.getName(), sequentialContent.equals(nonSequentialContent)); + } else { + assertEquals(f.getName(), sequentialContent, nonSequentialContent); + } + + //skip this one file. + if (knownMetadataDiffs.contains(f.getName())) { + assertFalse(f.getName(), sequentialMetadata.equals(nonSequentialMetadata)); + } else { + assertEquals(f.getName(), sequentialMetadata, nonSequentialMetadata); + } + } + //make sure nothing went wrong with getting the resource to test-documents + //must have tested >= 15 pdfs + boolean ge15 = (pdfs >= 15); + assertTrue("Number of pdf files tested >= 15 in non-sequential parser test", ge15); + } + + + // TIKA-973 + //commented out until test documents that are unambiguously + //consistent with Apache License v2.0 are contributed. 
+ //TODO: add back test for AcroForm extraction; test document should include + //recursive forms +/* public void testAcroForm() throws Exception{ + Parser p = new AutoDetectParser(); + ParseContext context = new ParseContext(); + InputStream stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf"); + String txt = getText(stream, p, context); + stream.close(); + + //simple first level form contents + assertContains("to: John Doe", txt); + //checkbox + assertContains("xpackaging: Yes", txt); + + //this guarantees that the form processor + //worked recursively at least once...i.e. it didn't just + //take the first form + stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf"); + txt = getText(stream, p, context); + stream.close(); + assertContains("123 Main St.", txt); + + + //now test with nonsequential parser + PDFParserConfig config = new PDFParserConfig(); + config.setUseNonSequentialParser(true); + context.set(PDFParserConfig.class, config); + stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf"); + txt = getText(stream, p, context); + stream.close(); + + //simple first level form contents + assertContains("to: John Doe", txt); + //checkbox + assertContains("xpackaging: Yes", txt); + + //this guarantees that the form processor + //worked recursively at least once...i.e. it didn't just + //take the first form + stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf"); + txt = getText(stream, p, context); + assertContains("123 Main St.", txt); + stream.close(); + } +*/ + + //TIKA-1226 + @Test + public void testSignatureInAcroForm() throws Exception { + //The current test doc does not contain any content in the signature area. + //This just tests that a RuntimeException is not thrown. + //TODO: find a better test file for this issue. 
+ String xml = getXML("/testPDF_acroform3.pdf").xml; + assertTrue("found", (xml.contains("<li>aTextField: TIKA-1226</li>"))); + } + + @Test // TIKA-1228, TIKA-1268 + public void testEmbeddedFilesInChildren() throws Exception { + String xml = getXML("/testPDF_childAttachments.pdf").xml; + //"regressiveness" exists only in Unit10.doc not in the container pdf document + assertTrue(xml.contains("regressiveness")); + + RecursiveParserWrapper p = new RecursiveParserWrapper(new AutoDetectParser(), + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); + ParseContext context = new ParseContext(); + PDFParserConfig config = new PDFParserConfig(); + config.setExtractInlineImages(true); + config.setExtractUniqueInlineImagesOnly(false); + context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); + context.set(org.apache.tika.parser.Parser.class, p); + + try (TikaInputStream tis = TikaInputStream.get( + getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) { + p.parse(tis, new BodyContentHandler(-1), new Metadata(), context); + } + + List<Metadata> metadatas = p.getMetadata(); + + assertEquals(5, metadatas.size()); + assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY)); + assertEquals("image0.jpg", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY)); + assertEquals("Press Quality(1).joboptions", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY)); + assertEquals("Unit10.doc", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY)); + assertEquals(MediaType.image("jpeg").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE)); + assertEquals(MediaType.image("tiff").toString(), metadatas.get(2).get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=ISO-8859-1", metadatas.get(3).get(Metadata.CONTENT_TYPE)); + assertEquals(TYPE_DOC.toString(), metadatas.get(4).get(Metadata.CONTENT_TYPE)); + } + + + @Test + public void testEmbeddedFilesInAnnotations() throws Exception { + String xml = 
getXML("/testPDFFileEmbInAnnotation.pdf").xml; + + assertTrue(xml.contains("This is a Excel")); + } + + @Test + public void testSingleCloseDoc() throws Exception { + //TIKA-1341 + InputStream is = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDFTripleLangTitle.pdf"); + Parser p = new AutoDetectParser(); + Metadata m = new Metadata(); + ParseContext c = new ParseContext(); + ContentHandler h = new EventCountingHandler(); + p.parse(is, h, m, c); + assertEquals(1, ((EventCountingHandler) h).getEndDocument()); + } + + @Test + public void testVersions() throws Exception { + + Map<String, String> dcFormat = new HashMap<String, String>(); + dcFormat.put("4.x", "application/pdf; version=1.3"); + dcFormat.put("5.x", "application/pdf; version=1.4"); + dcFormat.put("6.x", "application/pdf; version=1.5"); + dcFormat.put("7.x", "application/pdf; version=1.6"); + dcFormat.put("8.x", "application/pdf; version=1.7"); + dcFormat.put("9.x", "application/pdf; version=1.7"); + dcFormat.put("10.x", "application/pdf; version=1.7"); + dcFormat.put("11.x.PDFA-1b", "application/pdf; version=1.7"); + + Map<String, String> pdfVersions = new HashMap<String, String>(); + pdfVersions.put("4.x", "1.3"); + pdfVersions.put("5.x", "1.4"); + pdfVersions.put("6.x", "1.5"); + pdfVersions.put("7.x", "1.6"); + pdfVersions.put("8.x", "1.7"); + pdfVersions.put("9.x", "1.7"); + pdfVersions.put("10.x", "1.7"); + pdfVersions.put("11.x.PDFA-1b", "1.7"); + + Map<String, String> pdfExtensionVersions = new HashMap<String, String>(); + pdfExtensionVersions.put("9.x", "1.7 Adobe Extension Level 3"); + pdfExtensionVersions.put("10.x", "1.7 Adobe Extension Level 8"); + pdfExtensionVersions.put("11.x.PDFA-1b", "1.7 Adobe Extension Level 8"); + + Parser p = new AutoDetectParser(); + for (Map.Entry<String, String> e : dcFormat.entrySet()) { + String fName = "testPDF_Version." 
+ e.getKey() + ".pdf"; + InputStream is = PDFParserTest.class.getResourceAsStream( + "/test-documents/" + fName); + Metadata m = new Metadata(); + ContentHandler h = new BodyContentHandler(); + ParseContext c = new ParseContext(); + p.parse(is, h, m, c); + is.close(); + boolean foundDC = false; + String[] vals = m.getValues("dc:format"); + for (String v : vals) { + if (v.equals(e.getValue())) { + foundDC = true; + } + } + assertTrue("dc:format ::" + e.getValue(), foundDC); + String extensionVersionTruth = pdfExtensionVersions.get(e.getKey()); + if (extensionVersionTruth != null) { + assertEquals("pdf:PDFExtensionVersion :: " + extensionVersionTruth, + extensionVersionTruth, + m.get("pdf:PDFExtensionVersion")); + } + assertEquals("pdf:PDFVersion", pdfVersions.get(e.getKey()), + m.get("pdf:PDFVersion")); + } + //now test full 11.x + String fName = "testPDF_Version.11.x.PDFA-1b.pdf"; + InputStream is = PDFParserTest.class.getResourceAsStream( + "/test-documents/" + fName); + Metadata m = new Metadata(); + ParseContext c = new ParseContext(); + ContentHandler h = new BodyContentHandler(); + p.parse(is, h, m, c); + is.close(); + Set<String> versions = new HashSet<String>(); + for (String fmt : m.getValues("dc:format")) { + versions.add(fmt); + } + + for (String hit : new String[]{"application/pdf; version=1.7", + "application/pdf; version=\"A-1b\"", + "application/pdf; version=\"1.7 Adobe Extension Level 8\"" + }) { + assertTrue(hit, versions.contains(hit)); + } + + assertEquals("pdfaid:conformance", m.get("pdfaid:conformance"), "B"); + assertEquals("pdfaid:part", m.get("pdfaid:part"), "1"); + } + + @Test + public void testMultipleAuthors() throws Exception { + String fName = "testPDF_twoAuthors.pdf"; + InputStream is = PDFParserTest.class.getResourceAsStream( + "/test-documents/" + fName); + Parser p = new AutoDetectParser(); + Metadata m = new Metadata(); + ParseContext c = new ParseContext(); + ContentHandler h = new BodyContentHandler(); + p.parse(is, h, m, c); + 
is.close(); + + String[] keys = new String[]{ + "dc:creator", + "meta:author", + "creator", + "Author" + }; + + for (String k : keys) { + String[] vals = m.getValues(k); + assertEquals("number of authors == 2 for key: " + k, 2, vals.length); + Set<String> set = new HashSet<String>(); + set.add(vals[0]); + set.add(vals[1]); + assertTrue("Sample Author 1", set.contains("Sample Author 1")); + assertTrue("Sample Author 2", set.contains("Sample Author 2")); + } + } + + //STUB test for once TIKA-1295 is fixed + @Test + public void testMultipleTitles() throws Exception { + InputStream is = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDFTripleLangTitle.pdf"); + Parser p = new AutoDetectParser(); + Metadata m = new Metadata(); + ParseContext c = new ParseContext(); + ContentHandler h = new BodyContentHandler(); + p.parse(is, h, m, c); + is.close(); + //TODO: add other tests as part of TIKA-1295 + //dc:title-fr-ca (or whatever we decide) should be "Bonjour World" + //dc:title-zh-ch is currently hosed...bug in PDFBox while injecting xmp? 
+ // + assertEquals("Hello World", m.get("dc:title")); + } + + @Test + public void testInlineSelector() throws Exception { + + PDFParserConfig config = new PDFParserConfig(); + config.setExtractInlineImages(true); + config.setExtractUniqueInlineImagesOnly(false); + + Parser defaultParser = new AutoDetectParser(); + + RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); + ParseContext context = new ParseContext(); + context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); + context.set(org.apache.tika.parser.Parser.class, p); + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(-1); + String path = "/test-documents/testPDF_childAttachments.pdf"; + InputStream stream = TikaInputStream.get(this.getClass().getResource(path)); + + p.parse(stream, handler, metadata, context); + + List<Metadata> metadatas = p.getMetadata(); + int inline = 0; + int attach = 0; + for (Metadata m : metadatas) { + String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (v != null) { + if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) { + inline++; + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) { + attach++; + } + } + } + assertEquals(2, inline); + assertEquals(2, attach); + + stream.close(); + p.reset(); + + //now try turning off inline + stream = TikaInputStream.get(this.getClass().getResource(path)); + + context.set(org.apache.tika.extractor.DocumentSelector.class, new AvoidInlineSelector()); + inline = 0; + attach = 0; + handler = new BodyContentHandler(-1); + metadata = new Metadata(); + p.parse(stream, handler, metadata, context); + + metadatas = p.getMetadata(); + for (Metadata m : metadatas) { + String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (v != null) { + if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) { + inline++; + } 
else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) { + attach++; + } + } + } + assertEquals(0, inline); + assertEquals(2, attach); + + } + + + @Test + public void testInlineConfig() throws Exception { + + Parser defaultParser = new AutoDetectParser(); + RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); + ParseContext context = new ParseContext(); + context.set(org.apache.tika.parser.Parser.class, p); + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(-1); + String path = "/test-documents/testPDF_childAttachments.pdf"; + InputStream stream = TikaInputStream.get(this.getClass().getResource(path)); + + p.parse(stream, handler, metadata, context); + + List<Metadata> metadatas = p.getMetadata(); + int inline = 0; + int attach = 0; + for (Metadata m : metadatas) { + String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (v != null) { + if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) { + inline++; + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) { + attach++; + } + } + } + assertEquals(0, inline); + assertEquals(2, attach); + + stream.close(); + p.reset(); + + //now try turning off inline + stream = TikaInputStream.get(this.getClass().getResource(path)); + PDFParserConfig config = new PDFParserConfig(); + config.setExtractInlineImages(true); + config.setExtractUniqueInlineImagesOnly(false); + + context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); + inline = 0; + attach = 0; + handler = new BodyContentHandler(-1); + metadata = new Metadata(); + p.parse(stream, handler, metadata, context); + + metadatas = p.getMetadata(); + for (Metadata m : metadatas) { + String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (v != null) { + if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) { + 
inline++; + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())) { + attach++; + } + } + } + assertEquals(2, inline); + assertEquals(2, attach); + } + + @Test //TIKA-1376 + public void testEmbeddedFileNameExtraction() throws Exception { + InputStream is = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF_multiFormatEmbFiles.pdf"); + RecursiveParserWrapper p = new RecursiveParserWrapper( + new AutoDetectParser(), + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); + Metadata m = new Metadata(); + ParseContext c = new ParseContext(); + c.set(org.apache.tika.parser.Parser.class, p); + ContentHandler h = new BodyContentHandler(); + p.parse(is, h, m, c); + is.close(); + List<Metadata> metadatas = p.getMetadata(); + assertEquals("metadata size", 5, metadatas.size()); + Metadata firstAttachment = metadatas.get(1); + assertEquals("attachment file name", "Test.txt", firstAttachment.get(Metadata.RESOURCE_NAME_KEY)); + } + + @Test //TIKA-1374 + public void testOSSpecificEmbeddedFileExtraction() throws Exception { + InputStream is = PDFParserTest.class.getResourceAsStream( + "/test-documents/testPDF_multiFormatEmbFiles.pdf"); + RecursiveParserWrapper p = new RecursiveParserWrapper( + new AutoDetectParser(), + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); + Metadata m = new Metadata(); + ParseContext c = new ParseContext(); + c.set(org.apache.tika.parser.Parser.class, p); + ContentHandler h = new BodyContentHandler(); + p.parse(is, h, m, c); + is.close(); + List<Metadata> metadatas = p.getMetadata(); + assertEquals("metadata size", 5, metadatas.size()); + + assertEquals("file name", "Test.txt", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY)); + assertContains("os specific", metadatas.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals("file name", "TestMac.txt", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY)); + assertContains("mac embedded", 
metadatas.get(2).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals("file name", "TestDos.txt", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY)); + assertContains("dos embedded", metadatas.get(3).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals("file name", "TestUnix.txt", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY)); + assertContains("unix embedded", metadatas.get(4).get(RecursiveParserWrapper.TIKA_CONTENT)); + + } + + @Test //TIKA-1427 + public void testEmbeddedFileMarkup() throws Exception { + Parser parser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + context.set(org.apache.tika.parser.Parser.class, parser); + + PDFParserConfig config = new PDFParserConfig(); + config.setExtractInlineImages(true); + config.setExtractUniqueInlineImagesOnly(false); + context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); + + + Metadata metadata = new Metadata(); + ContentHandler handler = new ToXMLContentHandler(); + String path = "/test-documents/testPDF_childAttachments.pdf"; + InputStream stream = null; + try { + stream = TikaInputStream.get(this.getClass().getResource(path)); + parser.parse(stream, handler, metadata, context); + } finally { + IOUtils.closeQuietly(stream); + } + + String xml = handler.toString(); + //regular attachment + assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", xml); + //inline image + assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", xml); + + //doc embedded inside an annotation + xml = getXML("testPDFFileEmbInAnnotation.pdf").xml; + assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", xml); + } + + //Access checker tests + + @Test + public void testLegacyAccessChecking() throws Exception { + //test that default behavior doesn't throw AccessPermissionException + for (String file : new String[]{ + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + String xml = 
getXML(file).xml; + assertContains("Hello World", xml); + } + + //now try with the user password + PasswordProvider provider = new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "user"; + } + }; + + ParseContext context = new ParseContext(); + context.set(PasswordProvider.class, provider); + Parser parser = new AutoDetectParser(); + + for (String path : new String[]{ + "testPDF_no_extract_no_accessibility_owner_user.pdf", + "testPDF_no_extract_yes_accessibility_owner_user.pdf", + }) { + InputStream stream = null; + try { + stream = TikaInputStream.get(this.getClass().getResource("/test-documents/" + path)); + String text = getText(stream, parser, context); + assertContains("Hello World", text); + } finally { + IOUtils.closeQuietly(stream); + } + } + } + + @Test + public void testAccessCheckingEmptyPassword() throws Exception { + PDFParserConfig config = new PDFParserConfig(); + + //don't allow extraction, not even for accessibility + config.setAccessChecker(new AccessChecker(false)); + Parser parser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + context.set(PDFParserConfig.class, config); + + //test exception for empty password + for (String path : new String[]{ + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + assertException("/test-documents/" + path, parser, context, AccessPermissionException.class); + } + + config.setAccessChecker(new AccessChecker(true)); + assertException("/test-documents/" + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + parser, context, AccessPermissionException.class); + + InputStream is = null; + try { + is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_empty.pdf"); + assertContains("Hello World", getText(is, parser, context)); + } finally { + IOUtils.closeQuietly(is); + } + } + + @Test + public void testAccessCheckingUserPassword() throws 
Exception { + ParseContext context = new ParseContext(); + + PDFParserConfig config = new PDFParserConfig(); + //don't allow extraction, not even for accessibility + config.setAccessChecker(new AccessChecker(false)); + PasswordProvider passwordProvider = new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "user"; + } + }; + + context.set(PasswordProvider.class, passwordProvider); + context.set(PDFParserConfig.class, config); + + Parser parser = new AutoDetectParser(); + + //test bad passwords + for (String path : new String[]{ + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + assertException("/test-documents/" + path, parser, context, EncryptedDocumentException.class); + } + + //bad password is still a bad password + config.setAccessChecker(new AccessChecker(true)); + for (String path : new String[]{ + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + assertException("/test-documents/" + path, parser, context, EncryptedDocumentException.class); + } + + //now test documents that require this "user" password + assertException("/test-documents/" + "testPDF_no_extract_no_accessibility_owner_user.pdf", + parser, context, AccessPermissionException.class); + + + InputStream is = null; + try { + is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_user.pdf"); + assertContains("Hello World", getText(is, parser, context)); + } finally { + IOUtils.closeQuietly(is); + } + + config.setAccessChecker(new AccessChecker(false)); + for (String path : new String[]{ + "testPDF_no_extract_no_accessibility_owner_user.pdf", + "testPDF_no_extract_yes_accessibility_owner_user.pdf", + }) { + assertException("/test-documents/" + path, parser, context, AccessPermissionException.class); + } + } + + @Test + public void testAccessCheckingOwnerPassword() throws Exception { + 
ParseContext context = new ParseContext(); + + PDFParserConfig config = new PDFParserConfig(); + //don't allow extraction, not even for accessibility + config.setAccessChecker(new AccessChecker(true)); + PasswordProvider passwordProvider = new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "owner"; + } + }; + + context.set(PasswordProvider.class, passwordProvider); + context.set(PDFParserConfig.class, config); + + Parser parser = new AutoDetectParser(); + //with owner's password, text can be extracted, no matter the AccessibilityChecker's settings + for (String path : new String[]{ + "testPDF_no_extract_no_accessibility_owner_user.pdf", + "testPDF_no_extract_yes_accessibility_owner_user.pdf", + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + + InputStream is = null; + try { + is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_user.pdf"); + assertContains("Hello World", getText(is, parser, context)); + } finally { + IOUtils.closeQuietly(is); + } + } + + //really, with owner's password, all extraction is allowed + config.setAccessChecker(new AccessChecker(false)); + for (String path : new String[]{ + "testPDF_no_extract_no_accessibility_owner_user.pdf", + "testPDF_no_extract_yes_accessibility_owner_user.pdf", + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + + InputStream is = null; + try { + is = getResourceAsStream("/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_user.pdf"); + assertContains("Hello World", getText(is, parser, context)); + } finally { + IOUtils.closeQuietly(is); + } + } + } + + @Test + public void testPDFEncodedStringsInXMP() throws Exception { + //TIKA-1678 + XMLResult r = getXML("testPDF_PDFEncodedStringInXMP.pdf"); + assertEquals("Microsoft", r.metadata.get(TikaCoreProperties.TITLE)); + } + + private 
void assertException(String path, Parser parser, ParseContext context, Class expected) { + boolean noEx = false; + InputStream is = getResourceAsStream(path); + try { + String text = getText(is, parser, context); + noEx = true; + } catch (Exception e) { + assertEquals("Not the right exception: " + path, expected, e.getClass()); + } finally { + IOUtils.closeQuietly(is); + } + assertFalse(path + " should have thrown exception", noEx); + } + + /** + * Simple class to count end of document events. If functionality is useful, + * move to org.apache.tika in src/test + */ + private class EventCountingHandler extends ContentHandlerDecorator { + private int endDocument = 0; + + @Override + public void endDocument() { + endDocument++; + } + + public int getEndDocument() { + return endDocument; + } + } + + private class AvoidInlineSelector implements DocumentSelector { + + @Override + public boolean select(Metadata metadata) { + String v = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (v != null && v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())) { + return false; + } + return true; + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/pom.xml?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/pom.xml (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/pom.xml Sat Jan 16 18:23:01 2016 @@ -0,0 +1,143 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. 
The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-scientific-parser-module</artifactId> + <name>Apache Tika Scientific Parser Module</name> + <url>http://tika.apache.org/</url> + + <properties> + <netcdf-java.version>4.5.5</netcdf-java.version> + </properties> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-exec</artifactId> + <version>1.3</version> + </dependency> + <dependency> + <groupId>com.googlecode.json-simple</groupId> + <artifactId>json-simple</artifactId> + <version>1.1.1</version> + <exclusions> + <exclusion> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + 
<groupId>org.apache.sis.core</groupId> + <artifactId>sis-utility</artifactId> + <version>0.5</version> + </dependency> + <dependency> + <groupId>org.apache.sis.storage</groupId> + <artifactId>sis-netcdf</artifactId> + <version>0.5</version> + </dependency> + <dependency> + <groupId>org.apache.sis.core</groupId> + <artifactId>sis-metadata</artifactId> + <version>0.5</version> + </dependency> + <!-- edu.ucar dependencies --> + <dependency> + <groupId>edu.ucar</groupId> + <artifactId>netcdf4</artifactId> + <version>${netcdf-java.version}</version> + </dependency> + <dependency> + <groupId>edu.ucar</groupId> + <artifactId>grib</artifactId> + <version>${netcdf-java.version}</version> + </dependency> + <dependency> + <groupId>edu.ucar</groupId> + <artifactId>cdm</artifactId> + <version>${netcdf-java.version}</version> + <exclusions> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>jcl-over-slf4j</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>edu.ucar</groupId> + <artifactId>httpservices</artifactId> + <version>${netcdf-java.version}</version> + </dependency> + <!-- Apache cTAKES --> + <dependency> + <groupId>org.apache.ctakes</groupId> + <artifactId>ctakes-core</artifactId> + <version>3.2.2</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>${commons.io.version}</version> + </dependency> + <!-- Upstream parser libraries --> + <dependency> + <groupId>net.sourceforge.jmatio</groupId> + <artifactId>jmatio</artifactId> + <version>1.0</version> + </dependency> + <!-- Apache Commons CSV --> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-csv</artifactId> + <version>1.0</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-text-parser-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> 
+ <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.ctakes; + +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; + +/** + * This enumeration includes the properties that an {@see IdentifiedAnnotation} object can provide. 
+ * + */ +public enum CTAKESAnnotationProperty { + BEGIN("start"), + END("end"), + CONDITIONAL("conditional"), + CONFIDENCE("confidence"), + DISCOVERY_TECNIQUE("discoveryTechnique"), + GENERIC("generic"), + HISTORY_OF("historyOf"), + ID("id"), + ONTOLOGY_CONCEPT_ARR("ontologyConceptArr"), + POLARITY("polarity"); + + private String name; + + CTAKESAnnotationProperty(String name) { + this.name = name; + } + + public String getName() { + return name; + } +} \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.ctakes; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.Serializable; +import java.util.Properties; + +import static org.apache.commons.io.output.NullOutputStream.NULL_OUTPUT_STREAM; + +/** + * Configuration for {@see CTAKESContentHandler}. + * + * This class allows to enable cTAKES and set its parameters. + */ +public class CTAKESConfig implements Serializable { + /** + * Serial version UID + */ + private static final long serialVersionUID = -1599741171775528923L; + + // Path to XML descriptor for AnalysisEngine + private String aeDescriptorPath = "/ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml"; + + // UMLS username + private String UMLSUser = ""; + + // UMLS password + private String UMLSPass = ""; + + // Enables formatted output + private boolean prettyPrint = true; + + // Type of cTAKES (UIMA) serializer + private CTAKESSerializer serializerType = CTAKESSerializer.XMI; + + // OutputStream object used for CAS serialization + private OutputStream stream = NULL_OUTPUT_STREAM; + + // Enables CAS serialization + private boolean serialize = false; + + // Enables text analysis using cTAKES + private boolean text = true; + + // List of metadata to analyze using cTAKES + private String[] metadata = null; + + // List of annotation properties to add to metadata in addition to text covered by an annotation + private CTAKESAnnotationProperty[] annotationProps = null; + + // Character used to separate the annotation properties into metadata + private char separatorChar = ':'; + + /** + * Default constructor. + */ + public CTAKESConfig() { + init(this.getClass().getResourceAsStream("CTAKESConfig.properties")); + } + + /** + * Loads properties from InputStream and then tries to close InputStream. 
+ * @param stream {@see InputStream} object used to read properties. + */ + public CTAKESConfig(InputStream stream) { + init(stream); + } + + private void init(InputStream stream) { + if (stream == null) { + return; + } + Properties props = new Properties(); + + try { + props.load(stream); + } catch (IOException e) { + // TODO warning + } finally { + if (stream != null) { + try { + stream.close(); + } catch (IOException ioe) { + // TODO warning + } + } + } + + setAeDescriptorPath(props.getProperty("aeDescriptorPath", getAeDescriptorPath())); + setUMLSUser(props.getProperty("UMLSUser", getUMLSUser())); + setUMLSPass(props.getProperty("UMLSPass", getUMLSPass())); + setText(Boolean.valueOf(props.getProperty("text", Boolean.toString(isText())))); + setMetadata(props.getProperty("metadata", getMetadataAsString()).split(",")); + setAnnotationProps(props.getProperty("annotationProps", getAnnotationPropsAsString()).split(",")); + setSeparatorChar(props.getProperty("separatorChar", Character.toString(getSeparatorChar())).charAt(0)); + } + + /** + * Returns the path to XML descriptor for AnalysisEngine. + * @return the path to XML descriptor for AnalysisEngine. + */ + public String getAeDescriptorPath() { + return aeDescriptorPath; + } + + /** + * Returns the UMLS username. + * @return the UMLS username. + */ + public String getUMLSUser() { + return UMLSUser; + } + + /** + * Returns the UMLS password. + * @return the UMLS password. + */ + public String getUMLSPass() { + return UMLSPass; + } + + /** + * Returns {@code true} if formatted output is enabled, {@code false} otherwise. + * @return {@code true} if formatted output is enabled, {@code false} otherwise. + */ + public boolean isPrettyPrint() { + return prettyPrint; + } + + /** + * Returns the type of cTAKES (UIMA) serializer used to write the CAS. + * @return the type of cTAKES serializer. 
+ */ + public CTAKESSerializer getSerializerType() { + return serializerType; + } + + /** + * Returns an {@see OutputStream} object used write the CAS. + * @return {@see OutputStream} object used write the CAS. + */ + public OutputStream getOutputStream() { + return stream; + } + + /** + * Returns {@code true} if CAS serialization is enabled, {@code false} otherwise. + * @return {@code true} if CAS serialization output is enabled, {@code false} otherwise. + */ + public boolean isSerialize() { + return serialize; + } + + /** + * Returns {@code true} if content text analysis is enabled {@code false} otherwise. + * @return {@code true} if content text analysis is enabled {@code false} otherwise. + */ + public boolean isText() { + return text; + } + + /** + * Returns an array of metadata whose values will be analyzed using cTAKES. + * @return an array of metadata whose values will be analyzed using cTAKES. + */ + public String[] getMetadata() { + return metadata; + } + + /** + * Returns a string containing a comma-separated list of metadata whose values will be analyzed using cTAKES. + * @return a string containing a comma-separated list of metadata whose values will be analyzed using cTAKES. + */ + public String getMetadataAsString() { + if (metadata == null) { + return ""; + } + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < metadata.length; i++) { + sb.append(metadata[i]); + if (i < metadata.length-1) { + sb.append(","); + } + } + return sb.toString(); + } + + /** + * Returns an array of {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata. + * @return an array of {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata. + */ + public CTAKESAnnotationProperty[] getAnnotationProps() { + return annotationProps; + } + + /** + * Returns a string containing a comma-separated list of {@see CTAKESAnnotationProperty} names that will be included into cTAKES metadata. 
+ * @return + */ + public String getAnnotationPropsAsString() { + StringBuilder sb = new StringBuilder(); + sb.append("coveredText"); + if (annotationProps != null) { + for (CTAKESAnnotationProperty property : annotationProps) { + sb.append(separatorChar); + sb.append(property.getName()); + } + } + return sb.toString(); + } + + /** + * Returns the separator character used for annotation properties. + * @return the separator character used for annotation properties. + */ + public char getSeparatorChar() { + return separatorChar; + } + + /** + * Sets the path to XML descriptor for AnalysisEngine. + * @param aeDescriptorPath the path to XML descriptor for AnalysisEngine. + */ + public void setAeDescriptorPath(String aeDescriptorPath) { + this.aeDescriptorPath = aeDescriptorPath; + } + + /** + * Sets the UMLS username. + * @param uMLSUser the UMLS username. + */ + public void setUMLSUser(String uMLSUser) { + this.UMLSUser = uMLSUser; + } + + /** + * Sets the UMLS password. + * @param uMLSPass the UMLS password. + */ + public void setUMLSPass(String uMLSPass) { + this.UMLSPass = uMLSPass; + } + + /** + * Enables the formatted output for serializer. + * @param prettyPrint {@true} to enable formatted output, {@code false} otherwise. + */ + public void setPrettyPrint(boolean prettyPrint) { + this.prettyPrint = prettyPrint; + } + + /** + * Sets the type of cTAKES (UIMA) serializer used to write CAS. + * @param serializerType the type of cTAKES serializer. + */ + public void setSerializerType(CTAKESSerializer serializerType) { + this.serializerType = serializerType; + } + + /** + * Sets the {@see OutputStream} object used to write the CAS. + * @param stream the {@see OutputStream} object used to write the CAS. + */ + public void setOutputStream(OutputStream stream) { + this.stream = stream; + } + + /** + * Enables CAS serialization. + * @param serialize {@true} to enable CAS serialization, {@code false} otherwise. 
+ */ + public void setSerialize(boolean serialize) { + this.serialize = serialize; + } + + /** + * Enables content text analysis using cTAKES. + * @param text {@true} to enable content text analysis, {@code false} otherwise. + */ + public void setText(boolean text) { + this.text = text; + } + + /** + * Sets the metadata whose values will be analyzed using cTAKES. + * @param metadata the metadata whose values will be analyzed using cTAKES. + */ + public void setMetadata(String[] metadata) { + this.metadata = metadata; + } + + /** + * Sets the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata. + * @param annotationProps the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata. + */ + public void setAnnotationProps(CTAKESAnnotationProperty[] annotationProps) { + this.annotationProps = annotationProps; + } + + /** + * ets the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata. + * @param annotationProps the {@see CTAKESAnnotationProperty}'s that will be included into cTAKES metadata. + */ + public void setAnnotationProps(String[] annotationProps) { + CTAKESAnnotationProperty[] properties = new CTAKESAnnotationProperty[annotationProps.length]; + for (int i = 0; i < annotationProps.length; i++) { + properties[i] = CTAKESAnnotationProperty.valueOf(annotationProps[i]); + } + setAnnotationProps(properties); + } + + /** + * Sets the separator character used for annotation properties. + * @param separatorChar the separator character used for annotation properties. + */ + public void setSeparatorChar(char separatorChar) { + this.separatorChar = separatorChar; + } +} \ No newline at end of file
