This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 8346ea9 TIKA-2624 -- Rendering PDFs for OCR with Tesseract uses
different DPI than claimed via Ewan Mellor
8346ea9 is described below
commit 8346ea9ba10aa05399dce56e3ef19e92d4c5c6d9
Author: tallison <[email protected]>
AuthorDate: Tue Oct 22 17:51:06 2019 -0400
TIKA-2624 -- Rendering PDFs for OCR with Tesseract uses different DPI than
claimed via Ewan Mellor
---
CHANGES.txt | 4 ++++
.../java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 5 +++--
.../src/main/java/org/apache/tika/parser/pdf/PDFParser.java | 5 +++++
.../main/java/org/apache/tika/parser/pdf/PDFParserConfig.java | 10 +++++++++-
.../resources/org/apache/tika/parser/pdf/PDFParser.properties | 1 +
.../test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 1 -
6 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 5fa8ff7..861739c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
Release 1.23 - ??/??/???
+ * NOTE: The PDFParser now relies on ocrDPI, not the deprecated ocrImageScale,
+ to render page images when users configure OCR on rendered page images. This
+ will have the effect of increasing rendered image size (TIKA-2624).
+
* Upgrade to POI 4.1.1 (TIKA-2851).
* Upgrade to PDFBox 2.0.17 (TIKA-2951).
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 0abe9ee..2fbdbd2 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -339,12 +339,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
TemporaryResources tmp = new TemporaryResources();
try {
- BufferedImage image = renderer.renderImage(pageIndex,
config.getOcrImageScale(), config.getOcrImageType());
+ int dpi = config.getOcrDPI();
+ BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi,
config.getOcrImageType());
Path tmpFile = tmp.createTempFile();
try (OutputStream os = Files.newOutputStream(tmpFile)) {
//TODO: get output format from TesseractConfig
ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
- os, config.getOcrDPI(), config.getOcrImageQuality());
+ os, dpi, config.getOcrImageQuality());
}
try (InputStream is = TikaInputStream.get(tmpFile)) {
tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index c4fe4b7..7af13db 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -452,6 +452,11 @@ public class PDFParser extends AbstractParser implements
Initializable {
defaultConfig.setOcrImageFormatName(formatName);
}
+ @Deprecated
+ /**
+ * @deprecated as of Tika 1.23, this is no longer used in rendering page
images for OCR;
+ * use {@link #setOcrDPI(int)}
+ */
@Field
void setOcrImageScale(float imageScale) {
defaultConfig.setOcrImageScale(imageScale);
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index b48acb8..aafc176 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -126,6 +126,9 @@ public class PDFParserConfig implements Serializable {
private ImageType ocrImageType = ImageType.GRAY;
private String ocrImageFormatName = "png";
private float ocrImageQuality = 1.0f;
+ /**
+ * Deprecated as of Tika 1.23: no longer used in rendering page images for OCR; use ocrDPI instead.
+ */
private float ocrImageScale = 2.0f;
private AccessChecker accessChecker;
@@ -654,12 +657,17 @@ public class PDFParserConfig implements Serializable {
/**
* Scale to use if rendering a page and then running OCR on that rendered
image.
* Default is 2.0f.
- * @return
+ * @deprecated as of Tika 1.23, this is no longer used in rendering page
images; use {@link #setOcrDPI(int)}
*/
public float getOcrImageScale() {
return ocrImageScale;
}
+ /**
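+ * Sets the scale that was formerly applied when rendering a page image for OCR.
+ * @param ocrImageScale the scale factor; stored but no longer applied to rendering
+ * @deprecated as of Tika 1.23, this is no longer used in rendering page
images; use {@link #setOcrDPI(int)}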
+ */
public void setOcrImageScale(float ocrImageScale) {
this.ocrImageScale = ocrImageScale;
}
diff --git
a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
index 1a12ad9..739aa57 100644
---
a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
+++
b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
@@ -34,6 +34,7 @@ ocrImageFormatName png
#options: argb, binary, gray, rgb
ocrImageType gray
#scale to use when rendering a page image for OCR
+#as of Tika 1.23, this is no longer used; use ocrDPI instead
ocrImageScale 2.0
# Use up to 500MB when loading a pdf into a PDDocument
maxMainMemoryBytes 524288000
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 9fc3b67..0eb8df5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1391,7 +1391,6 @@ public class PDFParserTest extends TikaTest {
assertEquals(false,
pdfParserConfig.getExtractUniqueInlineImagesOnly());
assertEquals(314, pdfParserConfig.getOcrDPI());
assertEquals(2.1f, pdfParserConfig.getOcrImageQuality(), .01f);
- assertEquals(1.3f, pdfParserConfig.getOcrImageScale(), .01f);
assertEquals("jpeg", pdfParserConfig.getOcrImageFormatName());
assertEquals(524288000, pdfParserConfig.getMaxMainMemoryBytes());
assertEquals(false,
pdfParserConfig.getCatchIntermediateIOExceptions());