Repository: tika Updated Branches: refs/heads/master ab53bdc75 -> 7dda921de
TIKA-2173 add other setters to PDFParser so that they can be configured Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7dda921d Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7dda921d Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7dda921d Branch: refs/heads/master Commit: 7dda921de92c5ded7b62f431247bb2d7e8f06d45 Parents: ab53bdc Author: tballison <[email protected]> Authored: Tue Nov 8 12:57:18 2016 -0500 Committer: tballison <[email protected]> Committed: Tue Nov 8 12:57:18 2016 -0500 ---------------------------------------------------------------------- .../src/test/java/org/apache/tika/TikaTest.java | 9 +++++ .../apache/tika/parser/pdf/AccessChecker.java | 19 +++++++++++ .../org/apache/tika/parser/pdf/PDFParser.java | 36 ++++++++++++++++++++ .../apache/tika/parser/pdf/PDFParserConfig.java | 16 ++++++--- .../apache/tika/parser/pdf/PDFParserTest.java | 22 ++++++++++++ .../tika/parser/pdf/tika-inline-config.xml | 20 +++++++++++ 6 files changed, 118 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/7dda921d/tika-core/src/test/java/org/apache/tika/TikaTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 0bc5a83..462c1e5 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -204,6 +204,15 @@ public abstract class TikaTest { return wrapper.getMetadata(); } + protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap) throws Exception { + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { + wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); + } + return wrapper.getMetadata(); + } + /** * Basic text extraction. * <p> http://git-wip-us.apache.org/repos/asf/tika/blob/7dda921d/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java index 775e590..0bb6590 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java @@ -78,4 +78,23 @@ public class AccessChecker implements Serializable { throw new AccessPermissionException("Content extraction is not allowed."); } } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + AccessChecker checker = (AccessChecker) o; + + if (needToCheck != checker.needToCheck) return false; + return allowAccessibility == checker.allowAccessibility; + + } + + @Override + public int hashCode() { + int result = (needToCheck ? 1 : 0); + result = 31 * result + (allowAccessibility ? 1 : 0); + return result; + } } http://git-wip-us.apache.org/repos/asf/tika/blob/7dda921d/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 8237ea4..747c9c3 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -620,6 +620,42 @@ public class PDFParser extends AbstractParser { public void setOcrImageType(String imageType) { defaultConfig.setOcrImageType(imageType); } + + @Field + void setOcrDPI(int dpi) { + defaultConfig.setOcrDPI(dpi); + } + + @Field + void setExtractInlineImages(boolean extractInlineImages) { + defaultConfig.setExtractInlineImages(extractInlineImages); + } + + @Field + void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) { + defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions); + } + + @Field + void setExtractAcroFormContent(boolean extractAcroFormContent) { + defaultConfig.setExtractAcroFormContent(extractAcroFormContent); + } + + @Field + void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) { + defaultConfig.setIfXFAExtractOnlyXFA(ifXFAExtractOnlyXFA); + } + + @Field + void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) { + defaultConfig.setAccessChecker(new AccessChecker(allowExtractionForAccessibility)); + } + + @Field + void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) { + defaultConfig.setExtractUniqueInlineImagesOnly(extractUniqueInlineImagesOnly); + } + //can return null! private Document loadDOM(PDMetadata pdMetadata, ParseContext context) { if (pdMetadata == null) { http://git-wip-us.apache.org/repos/asf/tika/blob/7dda921d/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index 014ae7f..e9eb6e5 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -125,7 +125,7 @@ public class PDFParserConfig implements Serializable { //with a streams. If this is set to true, Tika's //parser catches these exceptions, reports them in the metadata //and then throws the first stored exception after the parse has completed. - private boolean isCatchIntermediateIOExceptions = true; + private boolean catchIntermediateIOExceptions = true; public PDFParserConfig() { init(this.getClass().getResourceAsStream("PDFParser.properties")); @@ -439,12 +439,20 @@ public class PDFParserConfig implements Serializable { /** * See {@link #setCatchIntermediateIOExceptions(boolean)} * @return whether or not to catch IOExceptions + * @deprecated use {@link #getCatchIntermediateIOExceptions()} */ public boolean isCatchIntermediateIOExceptions() { - return isCatchIntermediateIOExceptions; + return catchIntermediateIOExceptions; } /** + * See {@link #setCatchIntermediateIOExceptions(boolean)} + * @return whether or not to catch IOExceptions + */ + public boolean getCatchIntermediateIOExceptions() { + return catchIntermediateIOExceptions; + } + /** * The PDFBox parser will throw an IOException if there is * a problem with a stream. If this is set to <code>true</code>, * Tika's PDFParser will catch these exceptions and try to parse @@ -453,7 +461,7 @@ public class PDFParserConfig implements Serializable { * @param catchIntermediateIOExceptions */ public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions) { - isCatchIntermediateIOExceptions = catchIntermediateIOExceptions; + this.catchIntermediateIOExceptions = catchIntermediateIOExceptions; } /** @@ -646,7 +654,7 @@ public class PDFParserConfig implements Serializable { ", ocrImageType=" + ocrImageType + ", ocrImageFormatName='" + ocrImageFormatName + '\'' + ", accessChecker=" + accessChecker + - ", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions + + ", catchIntermediateIOExceptions=" + catchIntermediateIOExceptions + '}'; } } http://git-wip-us.apache.org/repos/asf/tika/blob/7dda921d/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 9ab0d74..1f0f4d6 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1270,6 +1270,28 @@ public class PDFParserTest extends TikaTest { assertContains("Tika - Content", content); } + @Test + public void testConfiguringMoreParams() throws Exception { + try (InputStream configIs = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-inline-config.xml")) { + assertNotNull(configIs); + TikaConfig tikaConfig = new TikaConfig(configIs); + AutoDetectParser p = new AutoDetectParser(tikaConfig); + //make absolutely certain the functionality works! + List<Metadata> metadata = getRecursiveMetadata("testOCR.pdf", p); + assertEquals(2, metadata.size()); + Map<MediaType, Parser> parsers = p.getParsers(); + Parser composite = parsers.get(MediaType.application("pdf")); + Parser pdfParser = ((CompositeParser)composite).getParsers().get(MediaType.application("pdf")); + assertTrue(pdfParser instanceof PDFParser); + PDFParserConfig pdfParserConfig = ((PDFParser)pdfParser).getPDFParserConfig(); + assertEquals(new AccessChecker(true), pdfParserConfig.getAccessChecker()); + assertEquals(true, pdfParserConfig.getExtractInlineImages()); + assertEquals(false, pdfParserConfig.getExtractUniqueInlineImagesOnly()); + assertEquals(314159, pdfParserConfig.getOcrDPI()); + assertEquals(false, pdfParserConfig.getCatchIntermediateIOExceptions()); + } + } + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path); http://git-wip-us.apache.org/repos/asf/tika/blob/7dda921d/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml new file mode 100644 index 0000000..9436604 --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml @@ -0,0 +1,20 @@ +<?xml version="1.0" encoding="UTF-8"?> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> + </parser> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <param name="extractInlineImages" type="bool">true</param> + <param name="allowExtractionForAccessibility" type="bool">true</param> + <param name="catchIntermediateExceptions" type="bool">false</param> + <param name="extractUniqueInlineImagesOnly" type="bool">false</param> + <param name="catchIntermediateExceptions" type="bool">false</param> + <param name="ocrDPI" type="int">314159</param> + <!-- we really should throw an exception for this!! --> + <param name="someRandomThingOrOther" type="bool">true</param> + </params> + </parser> + </parsers> +</properties>
