Repository: tika Updated Branches: refs/heads/2.x bcd59cee7 -> ab009aeb7
TIKA-2173 - first steps. Need to integrate parameter configuration into 2.x before I can do the rest Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7422218e Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7422218e Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7422218e Branch: refs/heads/2.x Commit: 7422218eb6e76a4f5744cd85c53d08e629fe7976 Parents: bcd59ce Author: tballison <[email protected]> Authored: Wed Nov 9 12:49:32 2016 -0500 Committer: tballison <[email protected]> Committed: Wed Nov 9 12:49:32 2016 -0500 ---------------------------------------------------------------------- .../src/test/java/org/apache/tika/TikaTest.java | 9 +++++ .../apache/tika/parser/pdf/AccessChecker.java | 19 ++++++++++ .../org/apache/tika/parser/pdf/PDFParser.java | 37 +++++++++++++++++++- .../apache/tika/parser/pdf/PDFParserConfig.java | 16 ++++++--- .../apache/tika/parser/pdf/PDFParserTest.java | 27 ++++++++++++++ .../tika/parser/pdf/tika-inline-config.xml | 20 +++++++++++ 6 files changed, 123 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-core/src/test/java/org/apache/tika/TikaTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index cfed800..0f6303e 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -217,6 +217,15 @@ public abstract class TikaTest { } } + protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap) throws Exception { + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { + wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); + } + return wrapper.getMetadata(); + } + /** * Basic text extraction. * <p> http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java index 775e590..0bb6590 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java @@ -78,4 +78,23 @@ public class AccessChecker implements Serializable { throw new AccessPermissionException("Content extraction is not allowed."); } } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + AccessChecker checker = (AccessChecker) o; + + if (needToCheck != checker.needToCheck) return false; + return allowAccessibility == checker.allowAccessibility; + + } + + @Override + public int hashCode() { + int result = (needToCheck ? 1 : 0); + result = 31 * result + (allowAccessibility ? 1 : 0); + return result; + } } http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 763c82b..185af6a 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -613,10 +613,45 @@ public class PDFParser extends AbstractParser { * * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} */ - public void setSortByPosition(boolean v) { + void setSortByPosition(boolean v) { defaultConfig.setSortByPosition(v); } +/* void setOcrStrategy(String ocrStrategyString) { + defaultConfig.setOcrStrategy(ocrStrategyString); + } + + void setOcrImageType(String imageType) { + defaultConfig.setOcrImageType(imageType); + } + + void setOcrDPI(int dpi) { + defaultConfig.setOcrDPI(dpi); + } +*/ + void setExtractInlineImages(boolean extractInlineImages) { + defaultConfig.setExtractInlineImages(extractInlineImages); + } + + void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) { + defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions); + } + + void setExtractAcroFormContent(boolean extractAcroFormContent) { + defaultConfig.setExtractAcroFormContent(extractAcroFormContent); + } + + void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) { + defaultConfig.setIfXFAExtractOnlyXFA(ifXFAExtractOnlyXFA); + } + + void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) { + defaultConfig.setAccessChecker(new AccessChecker(allowExtractionForAccessibility)); + } + + void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) { + defaultConfig.setExtractUniqueInlineImagesOnly(extractUniqueInlineImagesOnly); + } //can return null! private Document loadDOM(PDMetadata pdMetadata, ParseContext context) { http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index 296b191..cf43864 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -113,7 +113,7 @@ public class PDFParserConfig implements Serializable { //with a streams. If this is set to true, Tika's //parser catches these exceptions, reports them in the metadata //and then throws the first stored exception after the parse has completed. - private boolean isCatchIntermediateIOExceptions = true; + private boolean catchIntermediateIOExceptions = true; public PDFParserConfig() { init(this.getClass().getResourceAsStream("PDFParser.properties")); @@ -427,12 +427,20 @@ public class PDFParserConfig implements Serializable { /** * See {@link #setCatchIntermediateIOExceptions(boolean)} * @return whether or not to catch IOExceptions + * @deprecated use {@link #getCatchIntermediateIOExceptions()} */ public boolean isCatchIntermediateIOExceptions() { - return isCatchIntermediateIOExceptions; + return catchIntermediateIOExceptions; } /** + * See {@link #setCatchIntermediateIOExceptions(boolean)} + * @return whether or not to catch IOExceptions + */ + public boolean getCatchIntermediateIOExceptions() { + return catchIntermediateIOExceptions; + } + /** * The PDFBox parser will throw an IOException if there is * a problem with a stream. If this is set to <code>true</code>, * Tika's PDFParser will catch these exceptions and try to parse @@ -441,7 +449,7 @@ public class PDFParserConfig implements Serializable { * @param catchIntermediateIOExceptions */ public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions) { - isCatchIntermediateIOExceptions = catchIntermediateIOExceptions; + this.catchIntermediateIOExceptions = catchIntermediateIOExceptions; } /** @@ -608,7 +616,7 @@ public class PDFParserConfig implements Serializable { ", ocrImageType=" + ocrImageType + ", ocrImageFormatName='" + ocrImageFormatName + '\'' + ", accessChecker=" + accessChecker + - ", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions + + ", isCatchIntermediateIOExceptions=" + catchIntermediateIOExceptions + '}'; } } http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index dda0712..2292157 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -19,6 +19,7 @@ package org.apache.tika.parser.pdf; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -34,6 +35,7 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.tika.Tika; import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; @@ -48,6 +50,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.XMPMM; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; @@ -59,6 +62,7 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; import org.junit.AfterClass; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; import org.xml.sax.ContentHandler; @@ -1220,6 +1224,29 @@ public class PDFParserTest extends TikaTest { assertContains("Tika - Content", content); } + @Test + @Ignore("until we add parameter mods") + public void testConfiguringMoreParams() throws Exception { + try (InputStream configIs = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-inline-config.xml")) { + assertNotNull(configIs); + TikaConfig tikaConfig = new TikaConfig(configIs); + AutoDetectParser p = new AutoDetectParser(tikaConfig); + //make absolutely certain the functionality works! + List<Metadata> metadata = getRecursiveMetadata("testOCR.pdf", p); + assertEquals(2, metadata.size()); + Map<MediaType, Parser> parsers = p.getParsers(); + Parser composite = parsers.get(MediaType.application("pdf")); + Parser pdfParser = ((CompositeParser)composite).getParsers().get(MediaType.application("pdf")); + assertTrue(pdfParser instanceof PDFParser); + PDFParserConfig pdfParserConfig = ((PDFParser)pdfParser).getPDFParserConfig(); + assertEquals(new AccessChecker(true), pdfParserConfig.getAccessChecker()); + assertEquals(true, pdfParserConfig.getExtractInlineImages()); + assertEquals(false, pdfParserConfig.getExtractUniqueInlineImagesOnly()); + //assertEquals(314159, pdfParserConfig.getOcrDPI()); + assertEquals(false, pdfParserConfig.getCatchIntermediateIOExceptions()); + } + } + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path); http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml b/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml new file mode 100644 index 0000000..9436604 --- /dev/null +++ b/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml @@ -0,0 +1,20 @@ +<?xml version="1.0" encoding="UTF-8"?> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> + </parser> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <param name="extractInlineImages" type="bool">true</param> + <param name="allowExtractionForAccessibility" type="bool">true</param> + <param name="catchIntermediateExceptions" type="bool">false</param> + <param name="extractUniqueInlineImagesOnly" type="bool">false</param> + <param name="catchIntermediateExceptions" type="bool">false</param> + <param name="ocrDPI" type="int">314159</param> + <!-- we really should throw an exception for this!! --> + <param name="someRandomThingOrOther" type="bool">true</param> + </params> + </parser> + </parsers> +</properties>
