This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4bd897a7df772c208fb6918b6b7559e37e5ec3b9 Author: tallison <[email protected]> AuthorDate: Wed Jan 13 14:39:27 2021 -0500 TIKA-3258 -- in Tika 2.0.0, the default for OCR'ing of PDFs is 'auto' --- CHANGES.txt | 3 + .../apache/tika/parser/ocr/TesseractOCRParser.java | 36 +++------ .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 26 ++++--- .../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 2 +- .../apache/tika/parser/pdf/PDFParserConfig.java | 2 +- .../apache/tika/parser/pdf/PDFParser.properties | 2 +- .../org/apache/tika/parser/pdf/PDFParserTest.java | 88 ++++++++++++++-------- 7 files changed, 89 insertions(+), 70 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 6a9d1a6..0a12cb1 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,9 @@ Release 2.0.0-ALPHA - 01/13/2021 BREAKING CHANGES in 2.0.0 * General + * OCR is now triggered automatically for PDFs if tesseract + is on the user's path see (https://cwiki.apache.org/confluence/display/TIKA/TikaOCR#TikaOCR-disable-ocr) + for how to disable OCR. * Remove deprecated Metadata keys/properties (TIKA-1974). * Removed dangerous calls to read an inputstream or convert to bytes without specifying a charset diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 28fb5b8..264295c 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -88,7 +88,7 @@ import static org.apache.tika.sax.XHTMLContentHandler.XHTML; * * */ -public class TesseractOCRParser extends AbstractParser implements Initializable { +public class TesseractOCRParser extends AbstractParser { public static final String TESS_META = "tess:"; public static final Property IMAGE_ROTATION = Property.externalRealSeq(TESS_META+"rotation"); public static final Property IMAGE_MAGICK = Property.externalBooleanSeq(TESS_META+"image_magick_processed"); @@ -230,6 +230,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable ContentHandler xhtml, Metadata metadata, ParseContext parseContext, TesseractOCRConfig config) throws IOException, SAXException, TikaException { + warnOnFirstParse(); File tmpTxtOutput = null; try { Path input = tikaInputStream.getPath(); @@ -284,32 +285,9 @@ public class TesseractOCRParser extends AbstractParser implements Initializable } } - /** - * no-op - * @param params params to use for initialization - * @throws TikaConfigException - */ - @Override - public void initialize(Map<String, Param> params) throws TikaConfigException { - - } - - @Override - public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { - //this will incorrectly trigger for people who turn off Tesseract - //by sending in a bogus tesseract path via a custom TesseractOCRConfig. - //TODO: figure out how to solve that. - if (! hasWarned()) { - if (hasTesseract(defaultConfig)) { - problemHandler.handleInitializableProblem(this.getClass().getName(), - "Tesseract OCR is installed and will be automatically applied to image files unless\n" + - "you've excluded the TesseractOCRParser from the default parser.\n"+ - "Tesseract may dramatically slow down content extraction (TIKA-2359).\n" + - "As of Tika 1.15 (and prior versions), Tesseract is automatically called.\n" + - "In future versions of Tika, users may need to turn the TesseractOCRParser on via TikaConfig."); - warn(); - } + private void warnOnFirstParse() { + if (!hasWarned()) { + warn(); } } @@ -515,6 +493,10 @@ public class TesseractOCRParser extends AbstractParser implements Initializable } protected void warn() { + LOG.info("Tesseract is installed and is being invoked. " + + "This can add greatly to processing time. If you do not want tesseract " + + "to be applied to your files see: " + + "https://cwiki.apache.org/confluence/display/TIKA/TikaOCR#TikaOCR-disable-ocr"); HAS_WARNED = true; } diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index f60e53c..90ef86b 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -16,7 +16,10 @@ */ package org.apache.tika.parser.pdf; +import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.AUTO; import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR; +import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION; +import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.OCR_ONLY; import javax.xml.stream.XMLStreamException; import java.awt.image.BufferedImage; @@ -422,18 +425,23 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } - void doOCROnCurrentPage() throws IOException, TikaException, SAXException { - if (config.getOcrStrategy().equals(NO_OCR)) { + void doOCROnCurrentPage(PDFParserConfig.OCR_STRATEGY ocrStrategy) throws IOException, TikaException, SAXException { + if (ocrStrategy.equals(NO_OCR)) { return; } MediaType ocrImageMediaType = MediaType.image("ocr-"+config.getOcrImageFormatName()); if (! ocrParser.getSupportedTypes(context).contains(ocrImageMediaType)) { - throw new TikaException("" + - "I regret that I couldn't find an OCR parser to handle "+ocrImageMediaType+"."+ - "Please set the OCR_STRATEGY to NO_OCR or configure your" + - "OCR parser correctly" - ); + if (ocrStrategy == OCR_ONLY || ocrStrategy == OCR_AND_TEXT_EXTRACTION) { + throw new TikaException("" + + "I regret that I couldn't find an OCR parser to handle " + ocrImageMediaType + "." + + "Please set the OCR_STRATEGY to NO_OCR or configure your" + + "OCR parser correctly" + ); + } else if (ocrStrategy == AUTO) { + //silently skip + return; + } } PDFRenderer renderer = new PDFRenderer(pdDocument); @@ -544,11 +552,11 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { - doOCROnCurrentPage(); + doOCROnCurrentPage(OCR_AND_TEXT_EXTRACTION); } else if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.AUTO)) { //TODO add more sophistication if (totalCharsPerPage < 10 || unmappedUnicodeCharsPerPage > 10) { - doOCROnCurrentPage(); + doOCROnCurrentPage(AUTO); } } diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java index 539cd50..4eb849e 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java @@ -94,7 +94,7 @@ class OCR2XHTML extends AbstractPDF2XHTML { public void processPage(PDPage pdPage) throws IOException { try { startPage(pdPage); - doOCROnCurrentPage(); + doOCROnCurrentPage(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); endPage(pdPage); } catch (TikaException|SAXException e) { throw new IOExceptionWithCause(e); diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index b031a72..d52f91a 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -131,7 +131,7 @@ public class PDFParserConfig implements Serializable { //content from elsewhere in the document. private boolean ifXFAExtractOnlyXFA = false; - private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR; + private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.AUTO; private int ocrDPI = 300; private ImageType ocrImageType = ImageType.GRAY; diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties index 28bbbac..e2c0c32 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties @@ -26,7 +26,7 @@ allowExtractionForAccessibility true ifXFAExtractOnlyXFA false catchIntermediateIOExceptions true #options: no_ocr, ocr_only, ocr_and_text_extraction, auto -ocrStrategy no_ocr +ocrStrategy auto #dots per inch for the ocr rendering of the page image ocrDPI 300 #if you request tif, make sure you have imageio jars on your classpath! diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index b094374..439a6ea 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -82,11 +82,19 @@ public class PDFParserTest extends TikaTest { public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"); public static final MediaType TYPE_DOC = MediaType.application("msword"); + private static ParseContext NO_OCR() { + PDFParserConfig config = new PDFParserConfig(); + config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); + ParseContext context = new ParseContext(); + context.set(PDFParserConfig.class, config); + return context; + } @Test public void testXMLProfiler() throws Exception { //test that the xml profiler is not triggered by default - List<Metadata> metadataList = getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf"); + List<Metadata> metadataList = getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf", + NO_OCR()); assertEquals(1, metadataList.size()); //test that it is triggered when added to the default parser @@ -133,7 +141,8 @@ public class PDFParserTest extends TikaTest { @Test //TIKA-1374 public void testOSSpecificEmbeddedFileExtraction() throws Exception { - List<Metadata> metadatas = getRecursiveMetadata("testPDF_multiFormatEmbFiles.pdf"); + List<Metadata> metadatas = getRecursiveMetadata("testPDF_multiFormatEmbFiles.pdf", + NO_OCR()); assertEquals("metadata size", 5, metadatas.size()); assertEquals("file name", "Test.txt", metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); @@ -156,7 +165,7 @@ public class PDFParserTest extends TikaTest { docx */ - String content = getXML("testPDFEmbeddingAndEmbedded.docx").xml; + String content = getXML("testPDFEmbeddingAndEmbedded.docx", NO_OCR()).xml; int outerHaystack = content.indexOf("Outer_haystack"); int pdfHaystack = content.indexOf("pdf_haystack"); int needle = content.indexOf("Needle"); @@ -194,6 +203,7 @@ public class PDFParserTest extends TikaTest { PDFParserConfig config = new PDFParserConfig(); config.setExtractInlineImages(true); config.setExtractUniqueInlineImagesOnly(false); + config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); context.set(org.apache.tika.parser.Parser.class, p); @@ -219,53 +229,63 @@ public class PDFParserTest extends TikaTest { @Test public void testEmbeddedDocsWithOCROnly() throws Exception { assumeTrue("can run OCR", canRunOCR()); - + //test default is "auto" + assertEquals(PDFParserConfig.OCR_STRATEGY.AUTO, new PDFParserConfig().getOcrStrategy()); + testStrategy(null); + //now test other options for (PDFParserConfig.OCR_STRATEGY strategy : PDFParserConfig.OCR_STRATEGY.values()) { + testStrategy(strategy); + } + } + + private void testStrategy(PDFParserConfig.OCR_STRATEGY strategy) throws Exception { + //make sure everything works with regular xml _and_ with recursive + ParseContext context = new ParseContext(); + if (strategy != null) { PDFParserConfig config = new PDFParserConfig(); config.setOcrStrategy(strategy); - ParseContext context = new ParseContext(); context.set(PDFParserConfig.class, config); - //make sure everything works with regular xml _and_ with recursive - XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", context); - //can get dehaystack depending on version of tesseract and/or preprocessing - if (xmlResult.xml.contains("pdf_haystack") || xmlResult.xml.contains("dehaystack")) { - //great - } else { - fail("couldn't find pdf_haystack or its variants"); - } - assertContains("Haystack", xmlResult.xml); - assertContains("Needle", xmlResult.xml); - if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) { - // Tesseract may see the t in haystack as a ! some times... - //or it might see dehayslack... - //TODO: figure out how to make this test less hacky - String div = "<div class=\"ocr\">"; - if (xmlResult.xml.contains(div+"pdf_hays!ack")) { - } else if (xmlResult.xml.contains(div+"pdf_haystack")) { - } else if (xmlResult.xml.contains(div+"dehayslack")) { - } else { - fail("couldn't find acceptable variants of haystack"); - } + } + XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", context); + + //can get dehaystack depending on version of tesseract and/or preprocessing + if (xmlResult.xml.contains("pdf_haystack") || xmlResult.xml.contains("dehaystack")) { + //great + } else { + fail("couldn't find pdf_haystack or its variants"); + } + assertContains("Haystack", xmlResult.xml); + assertContains("Needle", xmlResult.xml); + if (strategy == null || strategy != PDFParserConfig.OCR_STRATEGY.NO_OCR) { + // Tesseract may see the t in haystack as a ! some times... + //or it might see dehayslack... + //TODO: figure out how to make this test less hacky + String div = "<div class=\"ocr\">"; + if (xmlResult.xml.contains(div+"pdf_hays!ack")) { + } else if (xmlResult.xml.contains(div+"pdf_haystack")) { + } else if (xmlResult.xml.contains(div+"dehayslack")) { } else { - assertNotContained("<div class=\"ocr\">pdf_haystack", xmlResult.xml); + fail("couldn't find acceptable variants of haystack"); } - assertEquals(4, getRecursiveMetadata("testPDFEmbeddingAndEmbedded.docx", context).size()); + } else { + assertNotContained("<div class=\"ocr\">pdf_haystack", xmlResult.xml); } - + assertEquals(4, getRecursiveMetadata("testPDFEmbeddingAndEmbedded.docx", context).size()); } @Test public void testFileInAnnotationExtractedIfNoContents() throws Exception { //TIKA-2845 - List<Metadata> contents = getRecursiveMetadata("testPDFFileEmbInAnnotation_noContents.pdf"); + List<Metadata> contents = getRecursiveMetadata("testPDFFileEmbInAnnotation_noContents.pdf", + NO_OCR()); assertEquals(2, contents.size()); assertContains("This is a Excel", contents.get(1).get(RecursiveParserWrapperHandler.TIKA_CONTENT)); } @Test public void testEmbeddedFilesInAnnotations() throws Exception { - String xml = getXML("testPDFFileEmbInAnnotation.pdf").xml; + String xml = getXML("testPDFFileEmbInAnnotation.pdf", NO_OCR()).xml; assertTrue(xml.contains("This is a Excel")); } @@ -275,6 +295,7 @@ public class PDFParserTest extends TikaTest { //TIKA-1990, test that an embedded jpeg is correctly decoded PDFParserConfig config = new PDFParserConfig(); config.setExtractInlineImages(true); + config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); ParseContext context = new ParseContext(); context.set(PDFParserConfig.class, config); @@ -295,6 +316,7 @@ public class PDFParserTest extends TikaTest { PDFParserConfig config = new PDFParserConfig(); config.setExtractInlineImages(true); config.setExtractUniqueInlineImagesOnly(false); + config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); context.set(PDFParserConfig.class, config); @@ -332,6 +354,10 @@ public class PDFParserTest extends TikaTest { @Test public void testOCRAutoMode() throws Exception { assumeTrue("can run OCR", canRunOCR()); + + //default + assertContains("Happy New Year", getXML("testOCR.pdf").xml); + PDFParserConfig config = new PDFParserConfig(); config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO); ParseContext context = new ParseContext();
