This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8ec5d44 TIKA-3270 -- when rendering a page for OCR, do not include
electronic text (as default)
8ec5d44 is described below
commit 8ec5d4483c953f6024cc470e780037e01530d7dd
Author: tallison <[email protected]>
AuthorDate: Thu May 20 13:49:11 2021 -0400
TIKA-3270 -- when rendering a page for OCR, do not include electronic text
(as default)
---
CHANGES.txt | 1 +
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 5 +-
.../apache/tika/parser/pdf/NoTextPDFRenderer.java | 72 ++++++++++++++++++++++
.../java/org/apache/tika/parser/pdf/PDFParser.java | 5 ++
.../apache/tika/parser/pdf/PDFParserConfig.java | 64 ++++++++++++++++++-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 26 ++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 25 +++++++-
7 files changed, 193 insertions(+), 5 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 617894a..16682b2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -11,6 +11,7 @@ Release 2.0.0-ALPHA - 01/13/2021
* OCR is now triggered automatically for PDFs if tesseract
is on the user's path see
(https://cwiki.apache.org/confluence/display/TIKA/TikaOCR#TikaOCR-disable-ocr)
for how to disable OCR.
+ * By default, when rendering a page for OCR, the PDFParser does not
render glyphs/text.
* Removed deprecated Metadata keys/properties (TIKA-1974).
* Removed dangerous calls to read an inputstream or convert to bytes
without specifying a charset
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 3170885..3f47272 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -465,10 +465,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- PDFRenderer renderer = new PDFRenderer(pdDocument);
+ PDFRenderer renderer =
+ config.getOcrRenderingStrategy() ==
PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT ?
+ new NoTextPDFRenderer(pdDocument) : new
PDFRenderer(pdDocument);
try (TemporaryResources tmp = new TemporaryResources()) {
-
int dpi = config.getOcrDPI();
Path tmpFile = null;
try {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/NoTextPDFRenderer.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/NoTextPDFRenderer.java
new file mode 100644
index 0000000..c7874f4
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/NoTextPDFRenderer.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.font.PDType3Font;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.rendering.PageDrawer;
+import org.apache.pdfbox.rendering.PageDrawerParameters;
+import org.apache.pdfbox.util.Matrix;
+import org.apache.pdfbox.util.Vector;
+
+/**
+ * This class extends the PDFRenderer to exclude rendering of electronic text.
+ */
+public class NoTextPDFRenderer extends PDFRenderer {
+
+    /**
+     * Creates a renderer for the given document.
+     *
+     * @param document the document handed to the {@link PDFRenderer} constructor
+     */
+    public NoTextPDFRenderer(PDDocument document) {
+        super(document);
+    }
+
+    /**
+     * Returns a new {@link PageDrawer} whose text-drawing callbacks are no-ops,
+     * configured with this renderer's annotation filter.
+     *
+     * @param parameters the parameters passed through to the page drawer
+     * @return a page drawer that does not draw text
+     * @throws IOException if the page drawer cannot be created
+     */
+    @Override
+    protected PageDrawer createPageDrawer(PageDrawerParameters parameters)
+            throws IOException {
+        PageDrawer pageDrawer = new NoTextPageDrawer(parameters);
+        pageDrawer.setAnnotationFilter(getAnnotationsFilter());
+        return pageDrawer;
+    }
+
+    /**
+     * A {@link PageDrawer} that overrides the text-related callbacks with empty
+     * bodies. Declared static because it never touches the enclosing renderer.
+     */
+    private static class NoTextPageDrawer extends PageDrawer {
+
+        public NoTextPageDrawer(PageDrawerParameters parameters) throws IOException {
+            super(parameters);
+        }
+
+        @Override
+        public void beginText() throws IOException {
+            // intentionally empty: text objects are not rendered
+        }
+
+        @Override
+        public void endText() throws IOException {
+            // intentionally empty: text objects are not rendered
+        }
+
+        @Override
+        protected void showFontGlyph(Matrix textRenderingMatrix, PDFont font, int code,
+                                     Vector displacement) throws IOException {
+            // intentionally empty: glyphs are not drawn
+        }
+
+        @Override
+        protected void showType3Glyph(Matrix textRenderingMatrix, PDType3Font font, int code,
+                                      Vector displacement) throws IOException {
+            // intentionally empty: Type 3 glyphs are not drawn
+        }
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index c7eebe9..d31fc97 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -480,6 +480,11 @@ public class PDFParser extends AbstractParser implements
Initializable {
}
@Field
+ public void setOcrRenderingStrategy(String ocrRenderingStrategy) {
+ defaultConfig.setOcrRenderingStrategy(ocrRenderingStrategy);
+ }
+
+ @Field
public void setOcrImageType(String imageType) {
defaultConfig.setOcrImageType(imageType);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index ad669e6..11aae38 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -96,6 +96,8 @@ public class PDFParserConfig implements Serializable {
private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.AUTO;
+ private OCR_RENDERING_STRATEGY ocrRenderingStrategy =
OCR_RENDERING_STRATEGY.NO_TEXT;
+
private int ocrDPI = 300;
private ImageType ocrImageType = ImageType.GRAY;
private String ocrImageFormatName = "png";
@@ -263,12 +265,23 @@ public class PDFParserConfig implements Serializable {
}
/**
- * If true, extract inline embedded OBXImages.
+ * If <code>true</code>, extract the literal inline embedded OBXImages.
+ * <p/>
* <b>Beware:</b> some PDF documents of modest size (~4MB) can contain
* thousands of embedded images totaling > 2.5 GB. Also, at least as
of PDFBox 1.8.5,
* there can be surprisingly large memory consumption and/or out of memory
errors.
- * Set to <code>true</code> with caution.
* <p/>
+ * Along the same lines, note that this does not extract "logical" images.
Some PDF writers
+ * break up a single logical image into hundreds of little images. With
this option set to
+ * <code>true</code>, you might get those hundreds of little images
+ * rather than the single logical image.
+ * <p/>
+ * NOTE ALSO: this extracts the raw images without clipping, rotation,
masks, color
+ * inversion, etc. The images that this extracts may look nothing like
what a human
+ * would expect given the appearance of the PDF.
+ * <p/>
+ * Set to <code>true</code> only with the greatest caution.
+ *
* The default is <code>false</code>.
* <p/>
*
@@ -491,6 +504,25 @@ public class PDFParserConfig implements Serializable {
setOcrStrategy(OCR_STRATEGY.parse(ocrStrategyString));
}
+ public OCR_RENDERING_STRATEGY getOcrRenderingStrategy() {
+ return ocrRenderingStrategy;
+ }
+
+ public void setOcrRenderingStrategy(String ocrRenderingStrategyString) {
+
setOcrRenderingStrategy(OCR_RENDERING_STRATEGY.parse(ocrRenderingStrategyString));
+ }
+
+ /**
+ * Sets what is drawn when a page is rendered for OCR: <code>ALL</code> includes
+ * the rendering of the electronic text; <code>NO_TEXT</code> runs OCR on only the
+ * images and vector graphics.
+ *
+ * @param ocrRenderingStrategy the rendering strategy to use for OCR
+ */
+ public void setOcrRenderingStrategy(OCR_RENDERING_STRATEGY
ocrRenderingStrategy) {
+ this.ocrRenderingStrategy = ocrRenderingStrategy;
+ userConfigured.add("ocrRenderingStrategy");
+ }
+
/**
* String representation of the image format used to render
* the page image for OCR (examples: png, tiff, jpeg)
@@ -846,4 +878,32 @@ public class PDFParserConfig implements Serializable {
throw new IllegalArgumentException(sb.toString());
}
}
+
+    public enum OCR_RENDERING_STRATEGY {
+        NO_TEXT, ALL; //AUTO?
+        // Would TEXT_ONLY be useful in instances where the unicode mappings
+        // are corrupt/non-existent?
+
+        /**
+         * Parses a strategy name, ignoring case.
+         *
+         * @param s the name to parse; <code>null</code> yields {@link #NO_TEXT}
+         * @return the matching strategy
+         * @throws IllegalArgumentException if <code>s</code> is neither
+         *         "no_text" nor "all" (case-insensitive)
+         */
+        private static OCR_RENDERING_STRATEGY parse(String s) {
+            if (s == null) {
+                return NO_TEXT;
+            }
+            // lower-case once with a fixed locale so the comparison is locale-independent
+            String normalized = s.toLowerCase(Locale.ROOT);
+            if ("no_text".equals(normalized)) {
+                return NO_TEXT;
+            } else if ("all".equals(normalized)) {
+                return ALL;
+            }
+            StringBuilder sb = new StringBuilder();
+            sb.append("I regret that I don't recognize '").append(s);
+            // name this enum (the message previously said OCR_STRATEGY)
+            sb.append("' as an OCR_RENDERING_STRATEGY. I only recognize:");
+            int i = 0;
+            for (OCR_RENDERING_STRATEGY strategy : OCR_RENDERING_STRATEGY.values()) {
+                if (i++ > 0) {
+                    sb.append(", ");
+                }
+                sb.append(strategy.toString());
+            }
+            throw new IllegalArgumentException(sb.toString());
+        }
+    }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 4ec104a..183ecbf 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -23,7 +23,12 @@ import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
+import java.awt.image.BufferedImage;
import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
@@ -36,7 +41,10 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Ignore;
@@ -1382,4 +1390,22 @@ public class PDFParserTest extends TikaTest {
metadata.set(TikaCoreProperties.TIKA_CONTENT,
contentHandler.toString());
return metadata;
}*/
+
+    /**
+     * Developer-only scratch utility: renders the first page of a local PDF
+     * with {@link NoTextPDFRenderer} at 300 DPI and writes it out as a PNG.
+     * Ignored because both paths exist only on the original author's machine.
+     */
+    @Test
+    @Ignore("one-off developer utility; depends on files that only exist on the author's machine")
+    public void oneOff() throws Exception {
+        Path source = Paths.get("/home/tallison/Downloads/tiger.pdf");
+        Path target = Paths.get("/home/tallison/Desktop/tiger-no-text.png");
+        // close the document even if rendering or writing fails
+        try (PDDocument pdDocument = PDDocument.load(source.toFile())) {
+            PDFRenderer renderer = new NoTextPDFRenderer(pdDocument);
+            BufferedImage image = renderer.renderImageWithDPI(0, 300);
+            try (OutputStream os = Files.newOutputStream(target)) {
+                //TODO: get output format from TesseractConfig
+                ImageIOUtil.writeImage(image, "png", os, 300);
+            }
+        }
+    }
+
+
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index aec89d5..ca9b0c7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -253,7 +253,10 @@ public class PDFParserTest extends TikaTest {
PDFParserConfig config = new PDFParserConfig();
config.setOcrStrategy(strategy);
context.set(PDFParserConfig.class, config);
- }
+ };
+ PDFParserConfig config = context.get(PDFParserConfig.class, new
PDFParserConfig());
+
config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.ALL);
+ context.set(PDFParserConfig.class, config);
XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx",
context);
//can get dehaystack depending on version of tesseract and/or
preprocessing
@@ -379,6 +382,26 @@ public class PDFParserTest extends TikaTest {
}
@Test
+ public void testOCRNoText() throws Exception {
+ assumeTrue("can run OCR", canRunOCR());
+ PDFParserConfig config = new PDFParserConfig();
+
config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.ALL);
+ config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(PDFParserConfig.class, config);
+ XMLResult xmlResult = getXML("testPDF_XFA_govdocs1_258578.pdf",
parseContext);
+ assertContains("PARK", xmlResult.xml);
+ assertContains("Applications", xmlResult.xml);
+
+
config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT);
+ config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ parseContext.set(PDFParserConfig.class, config);
+ xmlResult = getXML("testPDF_XFA_govdocs1_258578.pdf", parseContext);
+ assertContains("NATIONAL", xmlResult.xml);
+ assertNotContained("Applications", xmlResult.xml);
+ }
+
+ @Test
public void testTesseractInitializationWorks() throws Exception {
//TIKA-2970 -- make sure that configurations set on the
TesseractOCRParser
//make it through to when the TesseractOCRParser is called via