This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6b237ac3cdb0b43b86884248f14424e217e19ea2 Author: tallison <[email protected]> AuthorDate: Mon Oct 21 10:24:42 2019 -0400 TIKA-2970 -- ensure that configuration of the tesseract parser is respected by the PDFParser --- CHANGES.txt | 3 ++ .../apache/tika/parser/ocr/TesseractOCRParser.java | 5 +++ .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 9 ++++-- .../org/apache/tika/parser/pdf/PDFParserTest.java | 27 +++++++++++++++- .../org/apache/tika/parser/pdf/tika-ocr-config.xml | 36 ++++++++++++++++++++++ 5 files changed, 77 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 8bdc330..59e1e1b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -8,6 +8,9 @@ Release 2.0.0 - ??? Release 1.23 * Upgrade to PDFBox 2.0.17 + * Ensure that the PDFParser respects custom configuration of Tesseract + from tika-config.xml via Eric Pugh (TIKA-2970). + Release 1.22 - ??? * NOTE: Known regression: PDFBOX-4587 -- PDF passwords with codepoints diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 655aaf8..20a3fd1 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -716,6 +716,11 @@ public class TesseractOCRParser extends AbstractParser implements Initializable } @Field + public void setMaxFileSizeToOcr(long maxFileSizeToOcr) { + defaultConfig.setMaxFileSizeToOcr(maxFileSizeToOcr); + } + + @Field public void setMinFileSizeToOcr(long minFileSizeToOcr) { defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index f15c71b..f17d289 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -150,6 +150,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { final Metadata metadata; final EmbeddedDocumentExtractor embeddedDocumentExtractor; final PDFParserConfig config; + final TesseractOCRParser tesseractOCRParser;//can be null! //zero-based pageIndex int pageIndex = 0; @@ -165,6 +166,11 @@ class AbstractPDF2XHTML extends PDFTextStripper { this.metadata = metadata; this.config = config; embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + if (config.getOcrStrategy() == NO_OCR) { + tesseractOCRParser = null; + } else { + tesseractOCRParser = (TesseractOCRParser)EmbeddedDocumentUtil.tryToFindExistingLeafParser(TesseractOCRParser.class, context); + } } @Override @@ -322,9 +328,8 @@ class AbstractPDF2XHTML extends PDFTextStripper { return; } TesseractOCRConfig tesseractConfig = - context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG); + context.get(TesseractOCRConfig.class, tesseractOCRParser.getDefaultConfig()); - TesseractOCRParser tesseractOCRParser = new TesseractOCRParser(); if (! tesseractOCRParser.hasTesseract(tesseractConfig)) { throw new TikaException("Tesseract is not available. "+ "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly"); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 15aec20..1337b25 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -33,6 +33,7 @@ import java.util.Map; import java.util.Set; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.pdfbox.rendering.ImageType; @@ -1217,6 +1218,7 @@ public class PDFParserTest extends TikaTest { assertEquals(0, m.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length); assertNotContained("1309.61", content); } + @Test public void testEmbeddedJPEG() throws Exception { //TIKA-1990, test that an embedded jpeg is correctly decoded @@ -1289,6 +1291,30 @@ public class PDFParserTest extends TikaTest { assertContains("Norconex", xmlResult.xml); } + @Test + public void testTesseractInitializationWorks() throws Exception { + //TIKA-2970 -- make sure that configurations set on the TesseractOCRParser + //make it through to when the TesseractOCRParser is called via + //the PDFParser + if (!canRunOCR()) { + return; + } + + //via the config, tesseract should skip this file because it is too large + InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-ocr-config.xml"); + assertNotNull(is); + TikaConfig tikaConfig = new TikaConfig(is); + Parser p = new AutoDetectParser(tikaConfig); + String text = getText(getResourceAsStream("/test-documents/testOCR.pdf"), p); + assertTrue(StringUtils.isAllBlank(text)); + + //now override the max file size to ocr, and you should get text + ParseContext pc = new ParseContext(); + TesseractOCRConfig tesseractOCRConfig = new TesseractOCRConfig(); + pc.set(TesseractOCRConfig.class, tesseractOCRConfig); + text = getText(getResourceAsStream("/test-documents/testOCR.pdf"), p, pc); + assertContains("Happy", text); + } @Test public void testInitializationViaConfig() throws Exception { @@ -1301,7 +1327,6 @@ public class PDFParserTest extends TikaTest { // Column text is now interleaved: assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text); - } @Test diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml new file mode 100644 index 0000000..e187601 --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-ocr-config.xml @@ -0,0 +1,36 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> + <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> + </parser> + <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"> + <params> + <param name="maxFileSizeToOcr" type="long">100</param> + </params> + </parser> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <param name="extractInlineImages" type="bool">false</param> + <param name="ocrStrategy" type="string">ocr_only</param> + </params> + </parser> + </parsers> +</properties>
