This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 849fc38 Fix for TIKA-2624 contributed by ewanmellor. (#232)
849fc38 is described below
commit 849fc385e626d904c50c3338d4668f6ab44bd72a
Author: Ewan Mellor <[email protected]>
AuthorDate: Tue Oct 22 14:33:20 2019 -0700
Fix for TIKA-2624 contributed by ewanmellor. (#232)
Change AbstractPDF2XHTML.doOCROnCurrentPage to use the same DPI value
(PDFParserConfig.ocrDPI) for both the PDF rendering and the image metadata.
Previously, the PDF was rendered using ocrImageScale (default 2.0 ==
144dpi), and then ocrDPI (default 300) was written into the image metadata.
Having these two settings be independent makes no sense, and is surely going
to confuse Tesseract when the image metadata does not match the data.
This change means that ocrDPI drives both values, and ocrImageScale is
removed. This also switches from PDFRenderer.renderImage to
PDFRenderer.renderImageWithDPI, but that's just a thin wrapper (it converts
the DPI to a scale factor of dpi / 72) that makes it clearer what's going on.
This change will have the side-effect that the temporary images between the
PDF rendering and Tesseract will be roughly 4x larger in pixel count (144dpi
to 300dpi is about 2.1x in each dimension). This will
have a memory and temporary disk space impact, but it will ensure that the
whole pipeline uses 300dpi by default. People who have memory constraints
will need to reduce ocrDPI and make the corresponding changes on the
Tesseract side.
---
.../org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 5 +++--
.../main/java/org/apache/tika/parser/pdf/PDFParser.java | 5 -----
.../java/org/apache/tika/parser/pdf/PDFParserConfig.java | 16 ----------------
.../org/apache/tika/parser/pdf/PDFParser.properties | 2 --
.../java/org/apache/tika/parser/pdf/PDFParserTest.java | 1 -
5 files changed, 3 insertions(+), 26 deletions(-)
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index f17d289..32ed174 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -339,12 +339,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
TemporaryResources tmp = new TemporaryResources();
try {
- BufferedImage image = renderer.renderImage(pageIndex,
config.getOcrImageScale(), config.getOcrImageType());
+ int dpi = config.getOcrDPI();
+ BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi,
config.getOcrImageType());
Path tmpFile = tmp.createTempFile();
try (OutputStream os = Files.newOutputStream(tmpFile)) {
//TODO: get output format from TesseractConfig
ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
- os, config.getOcrDPI(), config.getOcrImageQuality());
+ os, dpi, config.getOcrImageQuality());
}
try (InputStream is = TikaInputStream.get(tmpFile)) {
tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 203cf12..e52311b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -450,11 +450,6 @@ public class PDFParser extends AbstractParser implements
Initializable {
defaultConfig.setOcrImageFormatName(formatName);
}
- @Field
- void setOcrImageScale(float imageScale) {
- defaultConfig.setOcrImageScale(imageScale);
- }
-
@Field
void setExtractBookmarksText(boolean extractBookmarksText) {
defaultConfig.setExtractBookmarksText(extractBookmarksText);
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index b48acb8..382af86 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -126,7 +126,6 @@ public class PDFParserConfig implements Serializable {
private ImageType ocrImageType = ImageType.GRAY;
private String ocrImageFormatName = "png";
private float ocrImageQuality = 1.0f;
- private float ocrImageScale = 2.0f;
private AccessChecker accessChecker;
@@ -218,8 +217,6 @@ public class PDFParserConfig implements Serializable {
setOcrImageType(parseImageType(props.getProperty("ocrImageType")));
- setOcrImageScale(getFloatProp(props.getProperty("ocrImageScale"),
getOcrImageScale()));
-
setExtractActions(getBooleanProp(props.getProperty("extractActions"),
false));
setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
@@ -652,19 +649,6 @@ public class PDFParserConfig implements Serializable {
}
/**
- * Scale to use if rendering a page and then running OCR on that rendered
image.
- * Default is 2.0f.
- * @return
- */
- public float getOcrImageScale() {
- return ocrImageScale;
- }
-
- public void setOcrImageScale(float ocrImageScale) {
- this.ocrImageScale = ocrImageScale;
- }
-
- /**
* Whether or not to extract PDActions from the file.
* Most Action types are handled inline; javascript macros
* are processed as embedded documents.
diff --git
a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
index 1a12ad9..ca9d52b 100644
---
a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
+++
b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
@@ -33,8 +33,6 @@ ocrDPI 300
ocrImageFormatName png
#options: argb, binary, gray, rgb
ocrImageType gray
-#scale to use when rendering a page image for OCR
-ocrImageScale 2.0
# Use up to 500MB when loading a pdf into a PDDocument
maxMainMemoryBytes 524288000
#whether or not to set KCMS for faster (but legacy/unsupported) image rendering
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 63d56ba..55ba257 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1376,7 +1376,6 @@ public class PDFParserTest extends TikaTest {
assertEquals(false,
pdfParserConfig.getExtractUniqueInlineImagesOnly());
assertEquals(314, pdfParserConfig.getOcrDPI());
assertEquals(2.1f, pdfParserConfig.getOcrImageQuality(), .01f);
- assertEquals(1.3f, pdfParserConfig.getOcrImageScale(), .01f);
assertEquals("jpeg", pdfParserConfig.getOcrImageFormatName());
assertEquals(524288000, pdfParserConfig.getMaxMainMemoryBytes());
assertEquals(false,
pdfParserConfig.getCatchIntermediateIOExceptions());