This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/2.x by this push:
new 4843ca1 TIKA-2286
4843ca1 is described below
commit 4843ca15798ce641218b2a3d1b46c37c308572db
Author: tballison <[email protected]>
AuthorDate: Wed Mar 1 09:22:13 2017 -0500
TIKA-2286
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 2 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 10 +++++++++
.../apache/tika/parser/pdf/PDFParserConfig.java | 24 ++++++++++++++++++++--
.../apache/tika/parser/pdf/tika-inline-config.xml | 4 +++-
4 files changed, 36 insertions(+), 4 deletions(-)
diff --git
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index f2904f1..0f0d103 100644
---
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -329,7 +329,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try (OutputStream os = Files.newOutputStream(tmpFile)) {
//TODO: get output format from TesseractConfig
ImageIOUtil.writeImage(image, config.getOCRImageFormatName(),
- os, config.getOCRDPI());
+ os, config.getOCRDPI(), config.getOcrImageQuality());
}
try (InputStream is = TikaInputStream.get(tmpFile)) {
tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
diff --git
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 56abac0..8f5205b 100644
---
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -638,6 +638,16 @@ public class PDFParser extends AbstractParser {
void setOcrDPI(int dpi) {
defaultConfig.setOcrDPI(dpi);
}
+
+ @Field
+ void setOcrImageQuality(float imageQuality) {
+ defaultConfig.setOcrImageQuality(imageQuality);
+ }
+
+ @Field
+ void setOcrImageFormatName(String formatName) {
+ defaultConfig.setOcrImageFormatName(formatName);
+ }
*/
void setExtractInlineImages(boolean extractInlineImages) {
defaultConfig.setExtractInlineImages(extractInlineImages);
diff --git
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 193513b..c6fe626 100644
---
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -106,6 +106,7 @@ public class PDFParserConfig implements Serializable {
private int ocrDPI = 300;
private ImageType ocrImageType = ImageType.GRAY;
private String ocrImageFormatName = "png";
+ private float ocrImageQuality = 1.0f;
private AccessChecker accessChecker;
@@ -318,7 +319,6 @@ public class PDFParserConfig implements Serializable {
*/
public void setExtractUniqueInlineImagesOnly(boolean
extractUniqueInlineImagesOnly) {
this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
-
}
/**
@@ -540,7 +540,9 @@ public class PDFParserConfig implements Serializable {
}
/**
- * Dots per inche used to render the page image for OCR
+ * Dots per inch used to render the page image for OCR.
+ * This does not apply to all image formats.
+ *
* @param ocrDPI
*/
public void setOCRDPI(int ocrDPI) {
@@ -557,6 +559,24 @@ public class PDFParserConfig implements Serializable {
}
/**
+ * Image quality used to render the page image for OCR.
+ * This does not apply to all image formats.
+ * @return the image quality used to render the page image for OCR
+ */
+ public float getOcrImageQuality() {
+ return ocrImageQuality;
+ }
+
+ /**
+ * Image quality used to render the page image for OCR.
+ * This does not apply to all image formats.
+ * @param ocrImageQuality image quality used to render the page image for OCR
+ */
+ public void setOcrImageQuality(float ocrImageQuality) {
+ this.ocrImageQuality = ocrImageQuality;
+ }
+
+ /**
* Whether or not to extract PDActions from the file.
* Most Action types are handled inline; javascript macros
* are processed as embedded documents.
diff --git
a/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
b/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
index 9436604..61373f7 100644
---
a/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
+++
b/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
@@ -11,7 +11,9 @@
<param name="catchIntermediateExceptions"
type="bool">false</param>
<param name="extractUniqueInlineImagesOnly"
type="bool">false</param>
<param name="catchIntermediateExceptions"
type="bool">false</param>
- <param name="ocrDPI" type="int">314159</param>
+ <param name="ocrDPI" type="int">314</param>
+ <param name="ocrImageQuality" type="float">2.1</param>
+ <param name="ocrImageFormatName" type="string">jpeg</param>
<!-- we really should throw an exception for this!! -->
<param name="someRandomThingOrOther" type="bool">true</param>
</params>
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].