TIKA-1986 -- revert ParseContext to ab7c325 and update PDFParser to handle non-primitive parameter setting
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/30b0f667 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/30b0f667 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/30b0f667 Branch: refs/heads/master Commit: 30b0f6674450f9aab4bacb5ab071ae96f4835c63 Parents: 0132037 Author: tballison <[email protected]> Authored: Wed Jun 15 14:54:25 2016 -0400 Committer: tballison <[email protected]> Committed: Wed Jun 15 14:54:25 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/parser/ParseContext.java | 56 ------------- .../tika/parser/pdf/AbstractPDF2XHTML.java | 10 +-- .../org/apache/tika/parser/pdf/PDFParser.java | 14 +++- .../apache/tika/parser/pdf/PDFParserConfig.java | 84 ++++++++++++++------ .../apache/tika/parser/pdf/PDFParserTest.java | 36 ++++----- .../parser/pdf/tika-config-non-primitives.xml | 29 +++++++ .../org/apache/tika/parser/pdf/tika-config.xml | 1 - 7 files changed, 118 insertions(+), 112 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/30b0f667/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index 68d5038..2521cc9 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -29,11 +29,9 @@ import java.io.IOException; import java.io.Serializable; import java.io.StringReader; import java.lang.reflect.Method; -import java.util.Collections; import java.util.HashMap; import java.util.Map; -import org.apache.tika.config.Param; import org.apache.tika.exception.TikaException; import org.xml.sax.EntityResolver; import 
org.xml.sax.InputSource; @@ -56,13 +54,6 @@ public class ParseContext implements Serializable { /** Map of objects in this context */ private final Map<String, Object> context = new HashMap<String, Object>(); - private final static Map<String, Param<?>> EMPTY_PARAMS = Collections.EMPTY_MAP; - - /** - * Map of configurable arguments - */ - private final Map<String, Map<String, Param<?>>> params = new HashMap<>(); - private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() { public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { return new InputSource(new StringReader("")); @@ -205,53 +196,6 @@ public class ParseContext implements Serializable { } /** - * @param clazz class associated with given param name - * @param value value - */ - public void setParam(Class clazz, Param<?> value){ - Map<String, Param<?>> classParams = this.params.get(clazz.getName()); - if (classParams == null) { - classParams = new HashMap<>(); - } - classParams.put(value.getName(), value); - this.params.put(clazz.getName(), classParams); - } - - /** - * Gets the value associated with given class and parameter - * @param clazz class - * @param key parameter name - * @return param value or null if the clazz or key doesn't exist - */ - public Param<?> getParam(Class clazz, String key) { - Map<String, Param<?>> classParams = this.params.get(clazz.getName()); - if (classParams != null) { - return classParams.get(key); - } - return null; - } - - /** - * Gets all the params for the specified class - * @param clazz class for which to grab the params - * @return map of key values or null if nothing has been specified - */ - public Map<String, Param<?>> getParams(Class clazz) { - if (params.containsKey(clazz.getName())) { - return params.get(clazz.getName()); - } - return EMPTY_PARAMS; - } - - /** - * Checks if parameter is available - * @param key parameter name - * @return true if parameter is available, false otherwise - */ 
- public boolean hasParam(Class clazz, String key){ - return params.containsKey(clazz) && params.get(clazz.getName()).containsKey(key); - } - /** * Returns the DOM builder factory specified in this parsing context. * If a factory is not explicitly specified, then a default factory * instance is created and returned. The default factory instance is http://git-wip-us.apache.org/repos/asf/tika/blob/30b0f667/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index d8a46a2..53463fb 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -245,7 +245,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { } void doOCROnCurrentPage() throws IOException, TikaException, SAXException { - if (config.getOCRStrategy().equals(NO_OCR)) { + if (config.getOcrStrategy().equals(NO_OCR)) { return; } TesseractOCRConfig tesseractConfig = @@ -260,12 +260,12 @@ class AbstractPDF2XHTML extends PDFTextStripper { PDFRenderer renderer = new PDFRenderer(pdDocument); TemporaryResources tmp = new TemporaryResources(); try { - BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOCRImageType()); + BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOcrImageType()); Path tmpFile = tmp.createTempFile(); try (OutputStream os = Files.newOutputStream(tmpFile)) { //TODO: get output format from TesseractConfig - ImageIOUtil.writeImage(image, config.getOCRImageFormatName(), - os, config.getOCRDPI()); + ImageIOUtil.writeImage(image, config.getOcrImageFormatName(), + os, config.getOcrDPI()); } try (InputStream is = TikaInputStream.get(tmpFile)) { tesseractOCRParser.parseInline(is, xhtml, 
tesseractConfig); @@ -350,7 +350,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } } - if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { + if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { doOCROnCurrentPage(); } xhtml.endElement("div"); http://git-wip-us.apache.org/repos/asf/tika/blob/30b0f667/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 7b12d58..a51ba1c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -39,7 +39,6 @@ import org.apache.pdfbox.pdmodel.common.PDMetadata; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; import org.apache.tika.config.Field; -import org.apache.tika.config.Param; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -57,7 +56,6 @@ import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.image.xmp.JempboxExtractor; import org.apache.tika.parser.ocr.TesseractOCRParser; import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.utils.AnnotationUtils; import org.w3c.dom.Document; import org.xml.sax.ContentHandler; import org.xml.sax.ErrorHandler; @@ -138,11 +136,11 @@ public class PDFParser extends AbstractParser { if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata, context); - } else if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) { + } else if 
(localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) { metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } else { - if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { + if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); } PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); @@ -597,7 +595,15 @@ public class PDFParser extends AbstractParser { defaultConfig.setSortByPosition(v); } + @Field + public void setOcrStrategy(String ocrStrategyString) { + defaultConfig.setOcrStrategy(ocrStrategyString); + } + @Field + public void setOcrImageType(String imageType) { + defaultConfig.setOcrImageType(imageType); + } //can return null! private Document loadDOM(PDMetadata pdMetadata, ParseContext context) { if (pdMetadata == null) { http://git-wip-us.apache.org/repos/asf/tika/blob/30b0f667/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index 3f8555c..014ae7f 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -61,8 +61,18 @@ public class PDFParserConfig implements Serializable { } else if (s.toLowerCase(Locale.ROOT).contains("ocr_and_text")) { return OCR_AND_TEXT_EXTRACTION; } - //default -- no ocr - return NO_OCR; + StringBuilder sb = new StringBuilder(); + sb.append("I regret that I don't recognize '").append(s); + sb.append("' as an OCR_STRATEGY. 
I only recognize:"); + int i = 0; + for (OCR_STRATEGY strategy : OCR_STRATEGY.values()) { + if (i++ > 0) { + sb.append(", "); + } + sb.append(strategy.toString()); + + } + throw new IllegalArgumentException(sb.toString()); } } @@ -180,13 +190,13 @@ public class PDFParserConfig implements Serializable { getBooleanProp(props.getProperty("catchIntermediateIOExceptions"), isCatchIntermediateIOExceptions())); - setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy"))); + setOcrStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy"))); - setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI())); + setOcrDPI(getIntProp(props.getProperty("ocrDPI"), getOcrDPI())); - setOCRImageFormatName(props.getProperty("ocrImageFormatName")); + setOcrImageFormatName(props.getProperty("ocrImageFormatName")); - setOCRImageType(parseImageType(props.getProperty("ocrImageType"))); + setOcrImageType(parseImageType(props.getProperty("ocrImageType"))); boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false); @@ -450,15 +460,22 @@ public class PDFParserConfig implements Serializable { * Which strategy to use for OCR * @param ocrStrategy */ - public void setOCRStrategy(OCR_STRATEGY ocrStrategy) { + public void setOcrStrategy(OCR_STRATEGY ocrStrategy) { this.ocrStrategy = ocrStrategy; } /** + * Which strategy to use for OCR + * @param ocrStrategyString + */ + public void setOcrStrategy(String ocrStrategyString) { + this.ocrStrategy = OCR_STRATEGY.parse(ocrStrategyString); + } + /** * * @return strategy to use for OCR */ - public OCR_STRATEGY getOCRStrategy() { + public OCR_STRATEGY getOcrStrategy() { return ocrStrategy; } @@ -489,26 +506,26 @@ public class PDFParserConfig implements Serializable { * the page image for OCR (examples: png, tiff, jpeg) * @return */ - public String getOCRImageFormatName() { + public String getOcrImageFormatName() { return ocrImageFormatName; } /** - * @see #getOCRImageFormatName() + * @see 
#getOcrImageFormatName() * * @param ocrImageFormatName name of image format used to render * page image */ - public void setOCRImageFormatName(String ocrImageFormatName) { + public void setOcrImageFormatName(String ocrImageFormatName) { this.ocrImageFormatName = ocrImageFormatName; } /** * Image type used to render the page image for OCR. - * @see #setOCRImageType(ImageType) + * @see #setOcrImageType(ImageType) * @return image type */ - public ImageType getOCRImageType() { + public ImageType getOcrImageType() { return ocrImageType; } @@ -516,15 +533,23 @@ public class PDFParserConfig implements Serializable { * Image type used to render the page image for OCR. * @param ocrImageType */ - public void setOCRImageType(ImageType ocrImageType) { + public void setOcrImageType(ImageType ocrImageType) { this.ocrImageType = ocrImageType; } /** + * Image type used to render the page image for OCR. + * @see #setOcrImageType(ImageType) + */ + public void setOcrImageType(String ocrImageTypeString) { + this.ocrImageType = parseImageType(ocrImageTypeString); + } + + /** * Dots per inch used to render the page image for OCR * @return dots per inch */ - public int getOCRDPI() { + public int getOcrDPI() { return ocrDPI; } @@ -532,7 +557,7 @@ public class PDFParserConfig implements Serializable { * Dots per inche used to render the page image for OCR * @param ocrDPI */ - public void setOCRDPI(int ocrDPI) { + public void setOcrDPI(int ocrDPI) { this.ocrDPI = ocrDPI; } @@ -542,7 +567,18 @@ public class PDFParserConfig implements Serializable { return t; } } - return null; + StringBuilder sb = new StringBuilder(); + sb.append("I regret that I could not parse '"); + sb.append(ocrImageType); + sb.append("'. 
I'm only familiar with: "); + int i = 0; + for (ImageType t : ImageType.values()) { + if (i++ == 0) { + sb.append(", "); + } + sb.append(t.toString()); + } + throw new IllegalArgumentException(sb.toString()); } @Override @@ -560,13 +596,13 @@ public class PDFParserConfig implements Serializable { if (getExtractInlineImages() != config.getExtractInlineImages()) return false; if (getExtractUniqueInlineImagesOnly() != config.getExtractUniqueInlineImagesOnly()) return false; if (getIfXFAExtractOnlyXFA() != config.getIfXFAExtractOnlyXFA()) return false; - if (getOCRDPI() != config.getOCRDPI()) return false; + if (getOcrDPI() != config.getOcrDPI()) return false; if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) return false; if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false; if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false; - if (!getOCRStrategy().equals(config.getOCRStrategy())) return false; - if (getOCRImageType() != config.getOCRImageType()) return false; - if (!getOCRImageFormatName().equals(config.getOCRImageFormatName())) return false; + if (!getOcrStrategy().equals(config.getOcrStrategy())) return false; + if (getOcrImageType() != config.getOcrImageType()) return false; + if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) return false; return getAccessChecker().equals(config.getAccessChecker()); } @@ -584,9 +620,9 @@ public class PDFParserConfig implements Serializable { result = 31 * result + getSpacingTolerance().hashCode(); result = 31 * result + (getIfXFAExtractOnlyXFA() ? 
1 : 0); result = 31 * result + ocrStrategy.hashCode(); - result = 31 * result + getOCRDPI(); - result = 31 * result + getOCRImageType().hashCode(); - result = 31 * result + getOCRImageFormatName().hashCode(); + result = 31 * result + getOcrDPI(); + result = 31 * result + getOcrImageType().hashCode(); + result = 31 * result + getOcrImageFormatName().hashCode(); result = 31 * result + getAccessChecker().hashCode(); result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0); return result; http://git-wip-us.apache.org/repos/asf/tika/blob/30b0f667/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index e9f55fe..a9ef000 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -16,7 +16,6 @@ */ package org.apache.tika.parser.pdf; -import static org.bouncycastle.crypto.tls.CipherType.stream; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -34,8 +33,8 @@ import java.util.Set; import org.apache.commons.io.IOUtils; import org.apache.log4j.Level; import org.apache.log4j.Logger; +import org.apache.pdfbox.rendering.ImageType; import org.apache.tika.TikaTest; -import org.apache.tika.config.Param; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; @@ -49,11 +48,7 @@ import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.XMPMM; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AutoDetectParser; -import 
org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.PasswordProvider; -import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.parser.*; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.ocr.TesseractOCRParser; import org.apache.tika.sax.BasicContentHandlerFactory; @@ -61,7 +56,6 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; import org.junit.AfterClass; import org.junit.BeforeClass; -import org.junit.Ignore; import org.junit.Test; import org.xml.sax.ContentHandler; @@ -1198,7 +1192,7 @@ public class PDFParserTest extends TikaTest { for (PDFParserConfig.OCR_STRATEGY strategy : PDFParserConfig.OCR_STRATEGY.values()) { PDFParserConfig config = new PDFParserConfig(); - config.setOCRStrategy(strategy); + config.setOcrStrategy(strategy); ParseContext context = new ParseContext(); context.set(PDFParserConfig.class, config); context.set(Parser.class, new AutoDetectParser()); @@ -1232,19 +1226,17 @@ public class PDFParserTest extends TikaTest { } @Test - @Ignore("We've turned this off for now") - public void testParameterizationViaContext() throws Exception { - ParseContext context = new ParseContext(); - - Param<Boolean> paramVal = new Param<>("sortByPosition", new Boolean(true)); - context.setParam(PDFParser.class, paramVal); - - Parser p = new AutoDetectParser(); - String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p, context); - text = text.replaceAll("\\s+", " "); - - // Column text is now interleaved: - assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text); + public void testInitializationOfNonPrimitivesViaConfig() throws Exception { + InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-config-non-primitives.xml"); + assertNotNull(is); + TikaConfig tikaConfig = new TikaConfig(is); + 
AutoDetectParser p = new AutoDetectParser(tikaConfig); + Map<MediaType, Parser> parsers = p.getParsers(); + Parser composite = parsers.get(MediaType.application("pdf")); + Parser pdfParser = ((CompositeParser)composite).getParsers().get(MediaType.application("pdf")); + assertEquals("org.apache.tika.parser.pdf.PDFParser", pdfParser.getClass().getName()); + assertEquals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY, ((PDFParser)pdfParser).getPDFParserConfig().getOcrStrategy()); + assertEquals(ImageType.RGB, ((PDFParser)pdfParser).getPDFParserConfig().getOcrImageType()); } http://git-wip-us.apache.org/repos/asf/tika/blob/30b0f667/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml new file mode 100644 index 0000000..3cc9d8b --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config-non-primitives.xml @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <param name="sortByPosition" type="bool">true</param> + <param name="ocrImageType" type="string">rgb</param> + <param name="ocrStrategy" type="string">ocr_only</param> + + </params> + </parser> + </parsers> +</properties> http://git-wip-us.apache.org/repos/asf/tika/blob/30b0f667/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml index 0b965c7..98940da 100644 --- a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml @@ -22,6 +22,5 @@ <param name="sortByPosition" type="bool">true</param> </params> </parser> - </parsers> </properties>
