This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 48257e37ce TIKA-4748 -- clean up ocr configuration within pdfparser
(#2864)
48257e37ce is described below
commit 48257e37cefb053337f92cda8ade33f0408d6006
Author: Tim Allison <[email protected]>
AuthorDate: Thu Jun 4 05:47:04 2026 -0400
TIKA-4748 -- clean up ocr configuration within pdfparser (#2864)
---
.../advanced/integration-testing/tika-app.adoc | 4 +-
.../advanced/integration-testing/tika-server.adoc | 2 +-
.../ROOT/pages/developers/serialization.adoc | 6 +-
.../migration-to-4x/migrating-tika-server-4x.adoc | 2 +-
.../pages/migration-to-4x/migrating-to-4x.adoc | 4 +
.../ROOT/pages/using-tika/server/index.adoc | 2 +-
.../apache/tika/cli/XmlToJsonConfigConverter.java | 57 ++++++++++
.../tika/cli/XmlToJsonConfigConverterTest.java | 59 ++++++++++
.../java/org/apache/tika/parser/ParseContext.java | 2 +-
.../tika/inference/OpenAIImageEmbeddingParser.java | 2 +-
.../apache/tika/parser/crypto/TSDParserTest.java | 2 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 40 +++----
.../configs/tika-config-non-primitives.json | 6 +-
.../resources/configs/tika-config-ocr-for-pdf.json | 4 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 30 +++---
.../java/org/apache/tika/parser/pdf/PDFParser.java | 10 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 119 +--------------------
.../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java | 6 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 10 +-
.../apache/tika/parser/pdf/tika-inline-config.json | 8 +-
.../apache/tika/config/loader/TikaJsonConfig.java | 4 +-
.../org/apache/tika/config/loader/TikaLoader.java | 4 +-
.../org/apache/tika/serialization/TikaModule.java | 2 +-
.../tika/config/loader/TikaJsonConfigTest.java | 5 +-
.../serialization/RoundTripSerializationTest.java | 10 +-
.../TestParseContextSerialization.java | 17 +--
.../resources/configs/example-tika-config.json | 4 +-
.../customocr/tika-config-rendered.json | 10 +-
.../config-examples/server-with-parsers.json | 4 +-
.../tika/server/standard/TikaResourceTest.java | 16 ++-
.../tika/server/standard/UnpackerResourceTest.java | 8 +-
.../standard/UnpackerResourceWithConfigTest.java | 12 ++-
32 files changed, 258 insertions(+), 213 deletions(-)
diff --git a/docs/modules/ROOT/pages/advanced/integration-testing/tika-app.adoc
b/docs/modules/ROOT/pages/advanced/integration-testing/tika-app.adoc
index 12cbbe3f7b..b5ea77381d 100644
--- a/docs/modules/ROOT/pages/advanced/integration-testing/tika-app.adoc
+++ b/docs/modules/ROOT/pages/advanced/integration-testing/tika-app.adoc
@@ -255,7 +255,9 @@ Create `/tmp/tika-app-test/my-config.json`:
"pdf-parser": {
"extractActions": true,
"extractInlineImages": true,
- "ocrStrategy": "NO_OCR"
+ "ocr": {
+ "strategy": "NO_OCR"
+ }
}
},
{
diff --git
a/docs/modules/ROOT/pages/advanced/integration-testing/tika-server.adoc
b/docs/modules/ROOT/pages/advanced/integration-testing/tika-server.adoc
index 7088ab7e8a..2608a6b490 100644
--- a/docs/modules/ROOT/pages/advanced/integration-testing/tika-server.adoc
+++ b/docs/modules/ROOT/pages/advanced/integration-testing/tika-server.adoc
@@ -255,7 +255,7 @@ curl -s -X POST -F "[email protected]" -H "Accept: application/json" http://loca
[source,bash]
----
curl -s -X POST -F "[email protected]" \
- -F 'config={"parsers":[{"pdf-parser":{"ocrStrategy":"NO_OCR"}}]}' \
+ -F 'config={"parsers":[{"pdf-parser":{"ocr":{"strategy":"NO_OCR"}}}]}' \
-H "Accept: application/json" \
http://localhost:9998/meta/config
----
diff --git a/docs/modules/ROOT/pages/developers/serialization.adoc
b/docs/modules/ROOT/pages/developers/serialization.adoc
index b68435d939..a5adc3e0e5 100644
--- a/docs/modules/ROOT/pages/developers/serialization.adoc
+++ b/docs/modules/ROOT/pages/developers/serialization.adoc
@@ -55,7 +55,7 @@ Tika uses a compact format for component configuration:
Components can be specified as:
* **String**: `"pdf-parser"` - creates instance with defaults
-* **Object**: `{"pdf-parser": {"ocrStrategy": "AUTO"}}` - creates configured instance
+* **Object**: `{"pdf-parser": {"ocr": {"strategy": "AUTO"}}}` - creates configured instance
== The @TikaComponent Annotation
@@ -214,7 +214,9 @@ Benefits:
{
"parse-context": {
"pdf-parser": {
- "ocrStrategy": "AUTO",
+ "ocr": {
+ "strategy": "AUTO"
+ },
"extractInlineImages": true
},
"commons-digester-factory": {
diff --git
a/docs/modules/ROOT/pages/migration-to-4x/migrating-tika-server-4x.adoc
b/docs/modules/ROOT/pages/migration-to-4x/migrating-tika-server-4x.adoc
index 2c67139dda..bd30d5a873 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/migrating-tika-server-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-tika-server-4x.adoc
@@ -64,7 +64,7 @@ POST endpoints accept multipart requests with a `file` part and optional `config
# Parse with custom PDF parser settings
curl -X POST http://localhost:9998/tika/json \
-F "[email protected]" \
- -F "config={\"pdf-parser\":{\"ocrStrategy\":\"no_ocr\"}};type=application/json"
+ -F "config={\"pdf-parser\":{\"ocr\":{\"strategy\":\"no_ocr\"}}};type=application/json"
----
== Breaking Changes
diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
index a3dc92901f..1aed5c00fa 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
@@ -64,6 +64,10 @@ The converter currently supports:
* **Special handling** - TesseractOCR's `otherTesseractSettings` list is automatically
converted to the `otherTesseractConfig` map format
+IMPORTANT: The converter is a starting point, not a complete translation. It handles only
+the `parsers` section, and some 3.x options were genuinely removed or restructured in 4.x
+with no mechanical equivalent. Review the generated JSON and confirm it loads before relying on it.
+
=== Example Conversion
**XML Format (3.x):**
diff --git a/docs/modules/ROOT/pages/using-tika/server/index.adoc
b/docs/modules/ROOT/pages/using-tika/server/index.adoc
index 811976aea9..f9edd5f9d1 100644
--- a/docs/modules/ROOT/pages/using-tika/server/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/server/index.adoc
@@ -99,7 +99,7 @@ POST with multipart for custom per-request configuration:
----
curl -X POST http://localhost:9998/tika/json \
-F "[email protected]" \
- -F "config={\"pdf-parser\":{\"ocrStrategy\":\"no_ocr\"}};type=application/json"
+ -F "config={\"pdf-parser\":{\"ocr\":{\"strategy\":\"no_ocr\"}}};type=application/json"
----
Valid handler paths under `/tika/`: `text`, `html`, `xml`, `md`, `json`. For
diff --git
a/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
b/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
index fc43b553d5..6a9457cc05 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
@@ -26,6 +26,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
@@ -52,6 +53,11 @@ import org.apache.tika.utils.XMLReaderUtils;
* Currently supports converting the "parsers" section of tika-config.xml files
* for parsers in the tika-parsers-standard module.
* <p>
+ * <strong>Best-effort starting point, not a complete translation.</strong>
Only the
+ * {@code parsers} section is converted, and some 3.x options were genuinely
removed or
+ * restructured in 4.x with no mechanical equivalent. Review the generated
JSON before
+ * relying on it.
+ * <p>
* Supports parameter types: bool, int, long, double, float, string, list, and
map.
* <p>
* <strong>Special Case:</strong> TesseractOCR's {@code
otherTesseractSettings} list
@@ -367,11 +373,62 @@ public class XmlToJsonConfigConverter {
config.put("exclude", excludes);
}
+ if ("pdf-parser".equals(componentName)) {
+ // 4.x PDFParserConfig groups OCR settings under a nested "ocr"
object
+ // (OcrConfig); the legacy flat ocr* keys were removed.
+ nestOcrParams(config);
+ }
+
Map<String, Object> result = new LinkedHashMap<>();
result.put(componentName, config);
return result;
}
+ // Maps the legacy flat PDFParser ocr* params to their nested OcrConfig
keys.
+ private static final Map<String, String> OCR_PARAM_TO_NESTED_KEY =
Map.ofEntries(
+ Map.entry("ocrStrategy", "strategy"),
+ Map.entry("ocrStrategyAuto", "strategyAuto"),
+ Map.entry("ocrRenderingStrategy", "renderingStrategy"),
+ Map.entry("ocrImageFormat", "imageFormat"),
+ Map.entry("ocrImageType", "imageType"),
+ Map.entry("ocrDPI", "dpi"),
+ Map.entry("ocrImageQuality", "imageQuality"),
+ Map.entry("ocrMaxImagePixels", "maxImagePixels"),
+ Map.entry("ocrMaxPagesToOcr", "maxPagesToOcr"));
+
+ /**
+ * Moves the legacy flat {@code ocr*} PDFParser params (e.g. {@code
ocrStrategy},
+ * {@code ocrDPI}) into the nested {@code "ocr"} object used by 4.x
+ * {@code PDFParserConfig} ({@code OcrConfig}). The flat {@code ocr*} JSON
keys were
+ * removed in 4.x, so a verbatim copy would no longer load.
+ */
+ private static void nestOcrParams(Map<String, Object> config) {
+ Map<String, Object> ocr = new LinkedHashMap<>();
+ // Seed from an explicitly-configured nested "ocr" map (<param
name="ocr" type="map">)
+ // so those values win; legacy flat ocr* params only fill keys it
doesn't supply.
+ Object existingOcr = config.get("ocr");
+ if (existingOcr instanceof Map<?, ?> existingMap) {
+ for (Map.Entry<?, ?> e : existingMap.entrySet()) {
+ if (e.getKey() instanceof String k) {
+ ocr.put(k, e.getValue());
+ }
+ }
+ }
+
+ Iterator<Map.Entry<String, Object>> it = config.entrySet().iterator();
+ while (it.hasNext()) {
+ Map.Entry<String, Object> entry = it.next();
+ String nestedKey = OCR_PARAM_TO_NESTED_KEY.get(entry.getKey());
+ if (nestedKey != null) {
+ ocr.putIfAbsent(nestedKey, entry.getValue());
+ it.remove();
+ }
+ }
+ if (!ocr.isEmpty()) {
+ config.put("ocr", ocr);
+ }
+ }
+
/**
* Converts a <params> element to a map of parameter names to values.
*/
diff --git
a/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
b/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
index ec138fb884..0ba74c6218 100644
---
a/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
+++
b/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.cli;
+import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -27,6 +28,8 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
@@ -316,4 +319,60 @@ public class XmlToJsonConfigConverterTest {
assertNotNull(parser);
assertTrue(parser instanceof CompositeParser);
}
+
+ @Test
+ public void testOcrNestedMapMergedWithLegacyFlatParams(@TempDir Path
tempDir) throws Exception {
+ // A pdf-parser config that carries BOTH an explicit nested "ocr" map
and
+ // legacy flat ocr* params. nestOcrParams must merge them into a single
+ // "ocr" object: explicitly-nested values win, and flat params only
fill
+ // the keys the nested map doesn't supply (TIKA-4748 review follow-up).
+ String xmlConfig = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+ "<properties>\n" +
+ " <parsers>\n" +
+ " <parser
class=\"org.apache.tika.parser.pdf.PDFParser\">\n" +
+ " <params>\n" +
+ " <param name=\"ocr\" type=\"map\">\n" +
+ " <strategy>NO_OCR</strategy>\n" +
+ " </param>\n" +
+ " <param name=\"ocrStrategy\"
type=\"string\">OCR_AND_TEXT_EXTRACTION</param>\n" +
+ " <param name=\"ocrDPI\"
type=\"int\">200</param>\n" +
+ " </params>\n" +
+ " </parser>\n" +
+ " </parsers>\n" +
+ "</properties>";
+
+ Path xmlPath = tempDir.resolve("pdf-ocr-merge.xml");
+ Path jsonPath = tempDir.resolve("pdf-ocr-merge.json");
+ Files.write(xmlPath, xmlConfig.getBytes(StandardCharsets.UTF_8));
+
+ XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
+
+ // Locate the pdf-parser config object within the parsers list.
+ JsonNode root = new ObjectMapper().readTree(jsonPath.toFile());
+ JsonNode pdf = null;
+ for (JsonNode entry : root.get("parsers")) {
+ if (entry.has("pdf-parser")) {
+ pdf = entry.get("pdf-parser");
+ break;
+ }
+ }
+ assertNotNull(pdf, "pdf-parser entry should be present");
+
+ // Legacy flat ocr* keys must have moved out of the top-level parser
config...
+ assertFalse(pdf.has("ocrStrategy"), "flat ocrStrategy should be
nested, not left at top level");
+ assertFalse(pdf.has("ocrDPI"), "flat ocrDPI should be nested, not left
at top level");
+
+ // ...and merged into the single nested "ocr" object.
+ JsonNode ocr = pdf.get("ocr");
+ assertNotNull(ocr, "merged nested ocr object should be present");
+ assertEquals("NO_OCR", ocr.get("strategy").asText(),
+ "explicit nested strategy must win over the legacy flat
ocrStrategy");
+ assertEquals(200, ocr.get("dpi").asInt(),
+ "legacy flat ocrDPI must be migrated into ocr.dpi");
+
+ // The merged config must still load.
+ TikaLoader loader = TikaLoader.load(jsonPath);
+ Parser parser = loader.loadParsers();
+ assertNotNull(parser);
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index 8d0094de11..037816ea17 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -110,7 +110,7 @@ public class ParseContext implements Serializable {
* <p>
* Example:
* <pre>
- * parseContext.setJsonConfig("pdf-parser", () -> "{\"ocrStrategy\":
\"AUTO\"}");
+ * parseContext.setJsonConfig("pdf-parser", () -> "{\"ocr\":
{\"strategy\": \"AUTO\"}}");
* parseContext.setJsonConfig("handler-config", () -> "{\"type\":
\"XML\"}");
* </pre>
*
diff --git
a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
index b186d5465e..f23b55f161 100644
---
a/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
+++
b/tika-parsers/tika-parsers-ml/tika-inference/src/main/java/org/apache/tika/inference/OpenAIImageEmbeddingParser.java
@@ -64,7 +64,7 @@ import org.apache.tika.utils.StringUtils;
* <p>
* This parser registers for the same {@code image/ocr-*} media types
* used by the PDF renderer's OCR pipeline, so it slots into the
- * existing {@code ocrStrategy} mechanism. When configured, each
+ * existing {@code ocr.strategy} mechanism. When configured, each
* rendered page image is sent to the embedding endpoint and the
* vector is stored as a serialized {@link Chunk} with a
* {@link PaginatedLocator} (when page number metadata is available).
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
index 56c9f29572..8a52046a51 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
@@ -36,7 +36,7 @@ public class TSDParserTest extends TikaTest {
public void testBrokenPdf() throws Exception {
ParseContext parseContext = new ParseContext();
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
+ config.getOcr().setStrategy(OcrConfig.Strategy.NO_OCR);
parseContext.set(PDFParserConfig.class, config);
//make sure that embedded file appears in list
//and make sure embedded exception is recorded
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index c8a116175c..77c7ee569f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -113,7 +113,7 @@ public class PDFParserTest extends TikaTest {
private static ParseContext NO_OCR() {
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
+ config.getOcr().setStrategy(OcrConfig.Strategy.NO_OCR);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
return context;
@@ -230,7 +230,7 @@ public class PDFParserTest extends TikaTest {
PDFParserConfig config = new PDFParserConfig();
config.setExtractInlineImages(true);
config.setExtractUniqueInlineImagesOnly(false);
- config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
+ config.getOcr().setStrategy(OcrConfig.Strategy.NO_OCR);
context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
context.set(org.apache.tika.parser.Parser.class, p);
@@ -260,7 +260,7 @@ public class PDFParserTest extends TikaTest {
public void testEmbeddedDocsWithOCROnly() throws Exception {
assumeTrue(canRunOCR(), "can't run OCR");
//test default is "auto"
- assertEquals(OcrConfig.Strategy.AUTO, new
PDFParserConfig().getOcrStrategy());
+ assertEquals(OcrConfig.Strategy.AUTO, new
PDFParserConfig().getOcr().getStrategy());
testStrategy(null);
//now test other options
for (OcrConfig.Strategy strategy : OcrConfig.Strategy.values()) {
@@ -273,11 +273,11 @@ public class PDFParserTest extends TikaTest {
ParseContext context = new ParseContext();
if (strategy != null) {
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(strategy);
+ config.getOcr().setStrategy(strategy);
context.set(PDFParserConfig.class, config);
};
PDFParserConfig config = context.get(PDFParserConfig.class, new
PDFParserConfig());
- config.setOcrRenderingStrategy(OcrConfig.RenderingStrategy.ALL);
+ config.getOcr().setRenderingStrategy(OcrConfig.RenderingStrategy.ALL);
context.set(PDFParserConfig.class, config);
XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx",
context);
@@ -328,7 +328,7 @@ public class PDFParserTest extends TikaTest {
//TIKA-1990, test that an embedded jpeg is correctly decoded
PDFParserConfig config = new PDFParserConfig();
config.setExtractInlineImages(true);
- config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
+ config.getOcr().setStrategy(OcrConfig.Strategy.NO_OCR);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
@@ -349,7 +349,7 @@ public class PDFParserTest extends TikaTest {
PDFParserConfig config = new PDFParserConfig();
config.setExtractInlineImages(true);
config.setExtractUniqueInlineImagesOnly(false);
- config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
+ config.getOcr().setStrategy(OcrConfig.Strategy.NO_OCR);
context.set(PDFParserConfig.class, config);
@@ -376,7 +376,7 @@ public class PDFParserTest extends TikaTest {
public void testJBIG2OCROnly() throws Exception {
assumeTrue(canRunOCR(), "can't run OCR");
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY);
+ config.getOcr().setStrategy(OcrConfig.Strategy.OCR_ONLY);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
//make sure everything works with regular xml _and_ with recursive
@@ -388,7 +388,7 @@ public class PDFParserTest extends TikaTest {
public void testJPEG2000() throws Exception {
assumeTrue(canRunOCR(), "can't run OCR");
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY);
+ config.getOcr().setStrategy(OcrConfig.Strategy.OCR_ONLY);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
//make sure everything works with regular xml _and_ with recursive
@@ -404,13 +404,13 @@ public class PDFParserTest extends TikaTest {
assertContains("Happy New Year", getXML("testOCR.pdf").xml);
PDFParserConfig config = new PDFParserConfig();
- config.setOcrStrategy(OcrConfig.Strategy.AUTO);
+ config.getOcr().setStrategy(OcrConfig.Strategy.AUTO);
ParseContext context = new ParseContext();
context.set(PDFParserConfig.class, config);
XMLResult xmlResult = getXML("testOCR.pdf", context);
assertContains("Happy New Year", xmlResult.xml);
- config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
+ config.getOcr().setStrategy(OcrConfig.Strategy.NO_OCR);
String txt = getText("testOCR.pdf", new Metadata(), context);
assertEquals("", txt.trim());
}
@@ -419,16 +419,16 @@ public class PDFParserTest extends TikaTest {
public void testOCRNoText() throws Exception {
assumeTrue(canRunOCR(), "can't run OCR");
PDFParserConfig config = new PDFParserConfig();
- config.setOcrRenderingStrategy(OcrConfig.RenderingStrategy.ALL);
- config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY);
+ config.getOcr().setRenderingStrategy(OcrConfig.RenderingStrategy.ALL);
+ config.getOcr().setStrategy(OcrConfig.Strategy.OCR_ONLY);
ParseContext parseContext = new ParseContext();
parseContext.set(PDFParserConfig.class, config);
XMLResult xmlResult = getXML("testPDF_XFA_govdocs1_258578.pdf",
parseContext);
assertContains("PARK", xmlResult.xml);
assertContains("Applications", xmlResult.xml);
- config.setOcrRenderingStrategy(OcrConfig.RenderingStrategy.NO_TEXT);
- config.setOcrStrategy(OcrConfig.Strategy.OCR_ONLY);
+
config.getOcr().setRenderingStrategy(OcrConfig.RenderingStrategy.NO_TEXT);
+ config.getOcr().setStrategy(OcrConfig.Strategy.OCR_ONLY);
parseContext.set(PDFParserConfig.class, config);
xmlResult = getXML("testPDF_XFA_govdocs1_258578.pdf", parseContext);
assertContains("NATIONAL", xmlResult.xml);
@@ -578,7 +578,7 @@ public class PDFParserTest extends TikaTest {
public void testPDFParserConfigSerialization() throws Exception {
// PDFParser is self-configuring: config goes via "pdf-parser" JSON
config path
String json = "{\"pdf-parser\": {\"sortByPosition\": true, " +
- "\"extractInlineImages\": true, \"ocrStrategy\": \"AUTO\"}}";
+ "\"extractInlineImages\": true, \"ocr\": {\"strategy\":
\"AUTO\"}}}";
com.fasterxml.jackson.databind.ObjectMapper mapper =
TikaObjectMapperFactory.getMapper();
ParseContext deserialized = mapper.readValue(json, ParseContext.class);
@@ -596,8 +596,8 @@ public class PDFParserTest extends TikaTest {
"sortByPosition should be preserved");
assertTrue(deserializedConfig.isExtractInlineImages(),
"extractInlineImages should be preserved");
- assertEquals(OcrConfig.Strategy.AUTO,
deserializedConfig.getOcrStrategy(),
- "ocrStrategy should be preserved");
+ assertEquals(OcrConfig.Strategy.AUTO,
deserializedConfig.getOcr().getStrategy(),
+ "ocr.strategy should be preserved");
}
@Test
@@ -651,9 +651,9 @@ public class PDFParserTest extends TikaTest {
assertEquals("org.apache.tika.parser.pdf.PDFParser",
pdfParser.getClass().getName());
assertEquals(OcrConfig.Strategy.OCR_ONLY,
- ((PDFParser) pdfParser).getPDFParserConfig().getOcrStrategy());
+ ((PDFParser)
pdfParser).getPDFParserConfig().getOcr().getStrategy());
assertEquals(OcrConfig.ImageType.RGB,
- ((PDFParser)
pdfParser).getPDFParserConfig().getOcrImageType());
+ ((PDFParser)
pdfParser).getPDFParserConfig().getOcr().getImageType());
}
private ParseContext configureRenderingParseContext() {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-non-primitives.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-non-primitives.json
index 8986211948..55ed4ce590 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-non-primitives.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-non-primitives.json
@@ -3,8 +3,10 @@
{
"pdf-parser": {
"sortByPosition": true,
- "ocrImageType": "RGB",
- "ocrStrategy": "OCR_ONLY"
+ "ocr": {
+ "imageType": "RGB",
+ "strategy": "OCR_ONLY"
+ }
}
}
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-ocr-for-pdf.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-ocr-for-pdf.json
index e217e1c934..11d68e46c5 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-ocr-for-pdf.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-ocr-for-pdf.json
@@ -11,7 +11,9 @@
{
"pdf-parser": {
"extractInlineImages": false,
- "ocrStrategy": "OCR_ONLY"
+ "ocr": {
+ "strategy": "OCR_ONLY"
+ }
}
}
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index a361521021..83891fcb84 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -209,7 +209,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
this.config = config;
this.renderer = renderer;
embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
- if (config.getOcrStrategy() == NO_OCR) {
+ if (config.getOcr().getStrategy() == NO_OCR) {
ocrParser = null;
} else {
ocrParser = EmbeddedDocumentUtil.getStatelessParser(context);
@@ -557,11 +557,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
// Enforce maxPagesToOcr limit
- int maxPagesToOcr = config.getOcrMaxPagesToOcr();
+ int maxPagesToOcr = config.getOcr().getMaxPagesToOcr();
if (maxPagesToOcr > 0 && c != null && c.getCount() > maxPagesToOcr) {
return;
}
- MediaType ocrImageMediaType = MediaType.image("ocr-" +
config.getOcrImageFormat().getFormatName());
+ MediaType ocrImageMediaType = MediaType.image("ocr-" +
config.getOcr().getImageFormat().getFormatName());
if (!ocrParser.getSupportedTypes(context).contains(ocrImageMediaType))
{
if (ocrStrategy == OCR_ONLY || ocrStrategy ==
OCR_AND_TEXT_EXTRACTION) {
throw new TikaException(
@@ -624,7 +624,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
Renderer thisRenderer = getPDFRenderer(renderer);
//if there's a configured renderer and if the rendering strategy is
"all"
if (thisRenderer != null &&
- config.getOcrRenderingStrategy() ==
OcrConfig.RenderingStrategy.ALL) {
+ config.getOcr().getRenderingStrategy() ==
OcrConfig.RenderingStrategy.ALL) {
PageRangeRequest pageRangeRequest =
new PageRangeRequest(getCurrentPageNo(),
getCurrentPageNo());
if (thisRenderer instanceof PDDocumentRenderer) {
@@ -673,7 +673,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
TemporaryResources
tmpResources)
throws IOException, TikaException {
PDFRenderer renderer = null;
- switch (config.getOcrRenderingStrategy()) {
+ switch (config.getOcr().getRenderingStrategy()) {
case NO_TEXT:
renderer = new NoTextPDFRenderer(pdDocument);
break;
@@ -688,7 +688,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
break;
}
- int dpi = config.getOcrDPI();
+ int dpi = config.getOcr().getDpi();
Path tmpFile = null;
RenderingTracker renderingTracker =
parseContext.get(RenderingTracker.class);
@@ -701,7 +701,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try {
// Check estimated pixel dimensions before rendering to
// prevent OOM on pathologically large pages
- long maxPixels = config.getOcrMaxImagePixels();
+ long maxPixels = config.getOcr().getMaxImagePixels();
if (maxPixels > 0) {
PDPage currentPage = pdDocument.getPage(pageIndex);
PDRectangle mediaBox = currentPage.getMediaBox();
@@ -720,14 +720,14 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
BufferedImage image =
- renderer.renderImageWithDPI(pageIndex, dpi,
config.getOcrImageType().getPdfBoxImageType());
+ renderer.renderImageWithDPI(pageIndex, dpi,
config.getOcr().getImageType().getPdfBoxImageType());
//TODO -- get suffix based on OcrImageType
tmpFile = tmpResources.createTempFile();
try (OutputStream os = Files.newOutputStream(tmpFile)) {
//TODO: get output format from TesseractConfig
- ImageIOUtil.writeImage(image,
config.getOcrImageFormat().getFormatName(), os, dpi,
- config.getOcrImageQuality());
+ ImageIOUtil.writeImage(image,
config.getOcr().getImageFormat().getFormatName(), os, dpi,
+ config.getOcr().getImageQuality());
}
} catch (SecurityException e) {
//throw SecurityExceptions immediately
@@ -754,21 +754,21 @@ class AbstractPDF2XHTML extends PDFTextStripper {
for (PDAnnotation annotation : page.getAnnotations()) {
processPageAnnotation(annotation);
}
- if (config.getOcrStrategy() == OCR_AND_TEXT_EXTRACTION) {
+ if (config.getOcr().getStrategy() == OCR_AND_TEXT_EXTRACTION) {
doOCROnCurrentPage(page, OCR_AND_TEXT_EXTRACTION);
- } else if (config.getOcrStrategy() == AUTO) {
+ } else if (config.getOcr().getStrategy() == AUTO) {
boolean unmappedExceedsLimit = false;
- if (totalCharsPerPage >
config.getOcrStrategyAuto().getTotalCharsPerPage()) {
+ if (totalCharsPerPage >
config.getOcr().getStrategyAuto().getTotalCharsPerPage()) {
// There are enough characters to not have to do OCR.
Check number of unmapped characters
final float percentUnmapped =
(float) unmappedUnicodeCharsPerPage /
totalCharsPerPage;
final float unmappedCharacterLimit =
-
config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
+
config.getOcr().getStrategyAuto().getUnmappedUnicodeCharsPerPage();
unmappedExceedsLimit = (unmappedCharacterLimit < 1) ?
percentUnmapped > unmappedCharacterLimit :
unmappedUnicodeCharsPerPage >
unmappedCharacterLimit;
}
- if (totalCharsPerPage <=
config.getOcrStrategyAuto().getTotalCharsPerPage() ||
+ if (totalCharsPerPage <=
config.getOcr().getStrategyAuto().getTotalCharsPerPage() ||
unmappedExceedsLimit) {
doOCROnCurrentPage(page, AUTO);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index b3824a621c..f4e734f532 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -213,7 +213,7 @@ public class PDFParser implements Parser, RenderingParser {
if (handler != null) {
if (shouldHandleXFAOnly(hasXFA, localConfig)) {
handleXFAOnly(pdfDocument, handler, metadata, context);
- } else if (localConfig.getOcrStrategy()
+ } else if (localConfig.getOcr().getStrategy()
.equals(OcrConfig.Strategy.OCR_ONLY)) {
OCR2XHTML.process(pdfDocument, handler, context, metadata,
localConfig, renderer);
@@ -432,7 +432,7 @@ public class PDFParser implements Parser, RenderingParser {
return true;
}
- if (localConfig.getOcrStrategy() == OcrConfig.Strategy.NO_OCR) {
+ if (localConfig.getOcr().getStrategy() == OcrConfig.Strategy.NO_OCR) {
return false;
}
//TODO: test that this is not AUTO with no OCR parser installed
@@ -751,9 +751,9 @@ public class PDFParser implements Parser, RenderingParser {
}
//set a default renderer if nothing was defined
PDFBoxRenderer pdfBoxRenderer = new PDFBoxRenderer();
- pdfBoxRenderer.setDPI(config.getOcrDPI());
-
pdfBoxRenderer.setImageType(config.getOcrImageType().getPdfBoxImageType());
-
pdfBoxRenderer.setImageFormatName(config.getOcrImageFormat().getFormatName());
+ pdfBoxRenderer.setDPI(config.getOcr().getDpi());
+
pdfBoxRenderer.setImageType(config.getOcr().getImageType().getPdfBoxImageType());
+
pdfBoxRenderer.setImageFormatName(config.getOcr().getImageFormat().getFormatName());
this.renderer = pdfBoxRenderer;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 96b3faa572..b83f449cad 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -522,122 +522,9 @@ public class PDFParserConfig implements Serializable {
this.ocr = ocr;
}
- /**
- * @return strategy to use for OCR
- */
- public OcrConfig.Strategy getOcrStrategy() {
- return ocr.getStrategy();
- }
-
- /**
- * @return ocr auto strategy to use when ocr_strategy = Auto
- */
- public OcrConfig.StrategyAuto getOcrStrategyAuto() {
- return ocr.getStrategyAuto();
- }
-
- /**
- * Which strategy to use for OCR
- */
- public void setOcrStrategy(OcrConfig.Strategy ocrStrategy) {
- ocr.setStrategy(ocrStrategy);
- }
-
- /**
- * Sets the OCR strategy auto configuration.
- */
- public void setOcrStrategyAuto(OcrConfig.StrategyAuto ocrStrategyAuto) {
- ocr.setStrategyAuto(ocrStrategyAuto);
- }
-
- public OcrConfig.RenderingStrategy getOcrRenderingStrategy() {
- return ocr.getRenderingStrategy();
- }
-
- /**
- * When rendering the page for OCR, do you want to include the rendering
of the electronic text,
- * ALL, or do you only want to run OCR on the images and vector graphics
(NO_TEXT)?
- */
- public void setOcrRenderingStrategy(OcrConfig.RenderingStrategy
ocrRenderingStrategy) {
- ocr.setRenderingStrategy(ocrRenderingStrategy);
- }
-
- public OcrConfig.ImageFormat getOcrImageFormat() {
- return ocr.getImageFormat();
- }
-
- public void setOcrImageFormat(OcrConfig.ImageFormat ocrImageFormat) {
- ocr.setImageFormat(ocrImageFormat);
- }
-
- public OcrConfig.ImageType getOcrImageType() {
- return ocr.getImageType();
- }
-
- public void setOcrImageType(OcrConfig.ImageType ocrImageType) {
- ocr.setImageType(ocrImageType);
- }
-
- /**
- * @return dots per inch used to render the page image for OCR
- */
- public int getOcrDPI() {
- return ocr.getDpi();
- }
-
- /**
- * Dots per inch used to render the page image for OCR.
- */
- public void setOcrDPI(int ocrDPI) {
- ocr.setDpi(ocrDPI);
- }
-
- /**
- * @return image quality used to render the page image for OCR
- */
- public float getOcrImageQuality() {
- return ocr.getImageQuality();
- }
-
- /**
- * Image quality used to render the page image for OCR.
- */
- public void setOcrImageQuality(float ocrImageQuality) {
- ocr.setImageQuality(ocrImageQuality);
- }
-
- /**
- * @return maximum total pixels (width × height) allowed for a
- * rendered page image before OCR is skipped
- */
- public long getOcrMaxImagePixels() {
- return ocr.getMaxImagePixels();
- }
-
- /**
- * Set the maximum total pixels (width × height) for a rendered
- * page image. Pages exceeding this limit are skipped for OCR.
- * Default is 100,000,000 (100 megapixels).
- */
- public void setOcrMaxImagePixels(long ocrMaxImagePixels) {
- ocr.setMaxImagePixels(ocrMaxImagePixels);
- }
-
- /**
- * @return maximum number of pages to OCR per document, or {@code -1}
- * for no limit
- */
- public int getOcrMaxPagesToOcr() {
- return ocr.getMaxPagesToOcr();
- }
-
- /**
- * Set the maximum number of pages to OCR per document.
- * Default is {@code -1} (no limit).
- */
- public void setOcrMaxPagesToOcr(int ocrMaxPagesToOcr) {
- ocr.setMaxPagesToOcr(ocrMaxPagesToOcr);
- }
+ // OCR settings are configured through the nested OcrConfig (getOcr()/setOcr()).
+ // The flat ocr* convenience accessors (getOcrStrategy/setOcrDPI/...) were removed in
+ // 4.x so that "ocr" is the single JSON spelling; use getOcr().setStrategy(...) etc.
/**
* @return whether or not to extract PDActions
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
index 1c6c526398..562de209e9 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
@@ -202,7 +202,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer {
if (pdfParserConfig == null) {
return defaultDPI;
}
- return pdfParserConfig.getOcrDPI();
+ return pdfParserConfig.getOcr().getDpi();
}
protected ImageType getImageType(ParseContext parseContext) {
@@ -210,7 +210,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer {
if (pdfParserConfig == null) {
return defaultImageType;
}
- return pdfParserConfig.getOcrImageType().getPdfBoxImageType();
+ return pdfParserConfig.getOcr().getImageType().getPdfBoxImageType();
}
protected String getImageFormatName(ParseContext parseContext) {
@@ -218,6 +218,6 @@ public class PDFBoxRenderer implements PDDocumentRenderer {
if (pdfParserConfig == null) {
return defaultImageFormatName;
}
- return pdfParserConfig.getOcrImageFormat().getFormatName();
+ return pdfParserConfig.getOcr().getImageFormat().getFormatName();
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index b8d3265b8f..947a45dbdd 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1160,8 +1160,8 @@ public class PDFParserTest extends TikaTest {
//this tests that a new PDFParserConfig completely resets
//behavior
config = new PDFParserConfig();
- config.setOcrDPI(10000);
- config.setOcrStrategy(OcrConfig.Strategy.NO_OCR);
+ config.getOcr().setDpi(10000);
+ config.getOcr().setStrategy(OcrConfig.Strategy.NO_OCR);
pc.set(PDFParserConfig.class, config);
text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc);
text = text.replaceAll("\\s+", " ");
@@ -1207,9 +1207,9 @@ public class PDFParserTest extends TikaTest {
pdfParserConfig.getAccessCheckMode());
assertEquals(true, pdfParserConfig.isExtractInlineImages());
assertEquals(false, pdfParserConfig.isExtractUniqueInlineImagesOnly());
- assertEquals(314, pdfParserConfig.getOcrDPI());
- assertEquals(2.1f, pdfParserConfig.getOcrImageQuality(), .01f);
- assertEquals(OcrConfig.ImageFormat.JPEG,
pdfParserConfig.getOcrImageFormat());
+ assertEquals(314, pdfParserConfig.getOcr().getDpi());
+ assertEquals(2.1f, pdfParserConfig.getOcr().getImageQuality(), .01f);
+ assertEquals(OcrConfig.ImageFormat.JPEG,
pdfParserConfig.getOcr().getImageFormat());
assertEquals(524288000, pdfParserConfig.getMaxMainMemoryBytes());
assertEquals(false, pdfParserConfig.isCatchIntermediateIOExceptions());
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.json
index deaea70b79..0e326e2fdf 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.json
@@ -9,9 +9,11 @@
"accessCheckMode": "ALLOW_EXTRACTION_FOR_ACCESSIBILITY",
"catchIntermediateIOExceptions": false,
"extractUniqueInlineImagesOnly": false,
- "ocrDPI": 314,
- "ocrImageQuality": 2.1,
- "ocrImageFormat": "JPEG",
+ "ocr": {
+ "dpi": 314,
+ "imageQuality": 2.1,
+ "imageFormat": "JPEG"
+ },
"maxMainMemoryBytes": 524288000
}
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
index 9989ca2b1a..d645ef73be 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -69,10 +69,10 @@ import org.apache.tika.exception.TikaConfigException;
* {
* // Core Tika components (validated by TikaLoader)
* "parsers": [
- * { "pdf-parser": { "_mime-include": ["application/pdf"], "ocrStrategy":
"AUTO", ... } },
+ * { "pdf-parser": { "_mime-include": ["application/pdf"], "ocr":
{"strategy": "AUTO"}, ... } },
* { "html-parser": { ... } },
* { "default-parser": { "exclude": ["some-parser"] } }
- * { "pdf-parser": { "_mime-include": ["application/pdf"], "ocrStrategy":
"AUTO" } },
+ * { "pdf-parser": { "_mime-include": ["application/pdf"], "ocr":
{"strategy": "AUTO"} } },
* "html-parser", // String shorthand for no-config components
* { "default-parser": { "exclude": ["ocr-parser"] } }
* ],
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 23bca1686e..7d6433afa5 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -79,7 +79,7 @@ import
org.apache.tika.serialization.serdes.ParseContextDeserializer;
* "pdf-parser": {
* "_mime-include": ["application/pdf"],
* "_mime-exclude": ["application/pdf+fdf"],
- * "ocrStrategy": "AUTO",
+ * "ocr": {"strategy": "AUTO"},
* "extractInlineImages": true
* }
* }
@@ -434,7 +434,7 @@ public class TikaLoader {
*
* // At runtime, create per-request overrides
* PDFParserConfig requestConfig = new PDFParserConfig();
- * requestConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ * requestConfig.getOcr().setStrategy(OcrConfig.Strategy.NO_OCR);
*
* // Merge: base config values + request overrides
* // (Note: for runtime merging, use JsonMergeUtils directly or loadConfig on a runtime loader)
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
index 63ea711796..c390e8f410 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
@@ -64,7 +64,7 @@ import
org.apache.tika.serialization.serdes.ParseContextSerializer;
* Supports two formats:
* <ol>
* <li>Simple string: {@code "text-parser"} → instance with defaults</li>
- * <li>Object with type as key: {@code {"pdf-parser": {"ocrStrategy":
"AUTO"}}} → instance with config</li>
+ * <li>Object with type as key: {@code {"pdf-parser": {"ocr": {"strategy":
"AUTO"}}}} → instance with config</li>
* </ol>
* <p>
* For components implementing {@link SelfConfiguring}, uses the {@link
JsonConfig} constructor.
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
index 10578c1f0c..e769de3bac 100644
---
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
@@ -39,7 +39,7 @@ public class TikaJsonConfigTest {
{
"parsers": [
"html-parser",
- { "pdf-parser": { "ocrStrategy": "AUTO" } },
+ { "pdf-parser": { "ocr": { "strategy": "AUTO" } } },
"txt-parser"
]
}
@@ -57,7 +57,8 @@ public class TikaJsonConfigTest {
// Second parser: full object syntax
assertEquals("pdf-parser", parsers.get(1).getKey());
- assertEquals("AUTO",
parsers.get(1).getValue().get("ocrStrategy").asText());
+ assertEquals("AUTO",
+ parsers.get(1).getValue().get("ocr").get("strategy").asText());
// Third parser: string shorthand
assertEquals("txt-parser", parsers.get(2).getKey());
diff --git
a/tika-serialization/src/test/java/org/apache/tika/serialization/RoundTripSerializationTest.java
b/tika-serialization/src/test/java/org/apache/tika/serialization/RoundTripSerializationTest.java
index 5482131289..277580d6c6 100644
---
a/tika-serialization/src/test/java/org/apache/tika/serialization/RoundTripSerializationTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/serialization/RoundTripSerializationTest.java
@@ -66,7 +66,7 @@ public class RoundTripSerializationTest {
@Test
void testSingleConfigRoundTrip() throws Exception {
ParseContext original = new ParseContext();
- original.setJsonConfig("pdf-parser",
"{\"ocrStrategy\":\"AUTO\",\"extractInlineImages\":true}");
+ original.setJsonConfig("pdf-parser",
"{\"ocr\":{\"strategy\":\"AUTO\"},\"extractInlineImages\":true}");
String json = mapper.writeValueAsString(original);
ParseContext reloaded = mapper.readValue(json, ParseContext.class);
@@ -80,7 +80,7 @@ public class RoundTripSerializationTest {
@Test
void testMultipleConfigsRoundTrip() throws Exception {
ParseContext original = new ParseContext();
- original.setJsonConfig("pdf-parser", "{\"ocrStrategy\":\"AUTO\"}");
+ original.setJsonConfig("pdf-parser",
"{\"ocr\":{\"strategy\":\"AUTO\"}}");
original.setJsonConfig("html-parser", "{\"extractScripts\":false}");
original.setJsonConfig("timeout-limits",
"{\"progressTimeoutMillis\":30000,\"totalTaskTimeoutMillis\":120000}");
@@ -116,7 +116,7 @@ public class RoundTripSerializationTest {
@Test
void testMultipleRoundTripsStability() throws Exception {
ParseContext context = new ParseContext();
- context.setJsonConfig("pdf-parser", "{\"ocrStrategy\":\"NO_OCR\"}");
+ context.setJsonConfig("pdf-parser",
"{\"ocr\":{\"strategy\":\"NO_OCR\"}}");
context.setJsonConfig("timeout-limits",
"{\"progressTimeoutMillis\":45000,\"totalTaskTimeoutMillis\":180000}");
@@ -223,7 +223,7 @@ public class RoundTripSerializationTest {
String wrappedJson = """
{
"parse-context": {
- "pdf-parser": {"ocrStrategy": "AUTO"}
+ "pdf-parser": {"ocr": {"strategy": "AUTO"}}
}
}
""";
@@ -238,7 +238,7 @@ public class RoundTripSerializationTest {
void testFlatFormatPreferred() throws Exception {
// Verify serialization uses flat format (no wrapper)
ParseContext original = new ParseContext();
- original.setJsonConfig("pdf-parser", "{\"ocrStrategy\":\"AUTO\"}");
+ original.setJsonConfig("pdf-parser",
"{\"ocr\":{\"strategy\":\"AUTO\"}}");
String json = mapper.writeValueAsString(original);
diff --git
a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
index 05669826e2..44cda725f4 100644
---
a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
+++
b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
@@ -102,7 +102,7 @@ public class TestParseContextSerialization {
ParseContext pc = new ParseContext();
// Add friendly-named configurations via setJsonConfig
- pc.setJsonConfig("pdf-parser",
"{\"ocrStrategy\":\"AUTO\",\"extractInlineImages\":true}");
+ pc.setJsonConfig("pdf-parser",
"{\"ocr\":{\"strategy\":\"AUTO\"},\"extractInlineImages\":true}");
pc.setJsonConfig("html-parser", "{\"extractScripts\":false}");
String json = serializeParseContext(pc);
@@ -116,7 +116,8 @@ public class TestParseContextSerialization {
assertTrue(root.has("html-parser"), "Should have html-parser field");
assertEquals("AUTO", root
.get("pdf-parser")
- .get("ocrStrategy")
+ .get("ocr")
+ .get("strategy")
.asText());
assertEquals(false, root
.get("html-parser")
@@ -169,7 +170,7 @@ public class TestParseContextSerialization {
ParseContext pc = new ParseContext();
// Simulate a PDFParserConfig as JSON
- String pdfConfig =
"{\"extractInlineImages\":true,\"ocrStrategy\":\"AUTO\"}";
+ String pdfConfig =
"{\"extractInlineImages\":true,\"ocr\":{\"strategy\":\"AUTO\"}}";
pc.setJsonConfig("pdf-parser", pdfConfig);
// Test hasConfig
@@ -188,7 +189,9 @@ public class TestParseContextSerialization {
String json = """
{
"pdf-parser": {
- "ocrStrategy": "AUTO",
+ "ocr": {
+ "strategy": "AUTO"
+ },
"extractInlineImages": true
},
"html-parser": {
@@ -216,7 +219,9 @@ public class TestParseContextSerialization {
{
"parse-context": {
"pdf-parser": {
- "ocrStrategy": "NO_OCR"
+ "ocr": {
+ "strategy": "NO_OCR"
+ }
}
}
}
@@ -233,7 +238,7 @@ public class TestParseContextSerialization {
// Test with multiple different config types
ParseContext pc = new ParseContext();
- pc.setJsonConfig("pdf-parser", "{\"ocrStrategy\":\"AUTO\"}");
+ pc.setJsonConfig("pdf-parser", "{\"ocr\":{\"strategy\":\"AUTO\"}}");
pc.setJsonConfig("html-parser", "{\"extractScripts\":true}");
pc.setJsonConfig("timeout-limits",
"{\"progressTimeoutMillis\":5000,\"totalTaskTimeoutMillis\":60000}");
diff --git
a/tika-serialization/src/test/resources/configs/example-tika-config.json
b/tika-serialization/src/test/resources/configs/example-tika-config.json
index 7bee601d95..75f507b2f8 100644
--- a/tika-serialization/src/test/resources/configs/example-tika-config.json
+++ b/tika-serialization/src/test/resources/configs/example-tika-config.json
@@ -4,7 +4,9 @@
"pdf-parser": {
"_mime-include": ["application/pdf"],
"_mime-exclude": ["application/pdf+fdf"],
- "ocrStrategy": "AUTO",
+ "ocr": {
+ "strategy": "AUTO"
+ },
"extractInlineImages": true
}
},
diff --git
a/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json
b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json
index 45f3d3bf72..a96760bb83 100644
---
a/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json
+++
b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json
@@ -1,15 +1,17 @@
{
"_comment": [
"Render each PDF page as an image and run Tesseract on it.",
- "ocrStrategy options: no_ocr, ocr_only, ocr_and_text, auto."
+ "ocr.strategy options: no_ocr, ocr_only, ocr_and_text_extraction, auto."
],
"parsers": [
{ "tesseract-ocr-parser": {} },
{
"pdf-parser": {
- "ocrStrategy": "ocr_only",
- "ocrImageType": "rgb",
- "ocrDPI": 100
+ "ocr": {
+ "strategy": "ocr_only",
+ "imageType": "rgb",
+ "dpi": 100
+ }
}
}
]
diff --git
a/tika-server/tika-server-core/src/test/resources/config-examples/server-with-parsers.json
b/tika-server/tika-server-core/src/test/resources/config-examples/server-with-parsers.json
index fadb08a55f..c093108a52 100644
---
a/tika-server/tika-server-core/src/test/resources/config-examples/server-with-parsers.json
+++
b/tika-server/tika-server-core/src/test/resources/config-examples/server-with-parsers.json
@@ -9,7 +9,9 @@
{
"pdf-parser": {
"extractInlineImages": true,
- "ocrStrategy": "AUTO"
+ "ocr": {
+ "strategy": "AUTO"
+ }
}
},
{
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
index 68f9eed897..58ef99ecc6 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
@@ -263,7 +263,9 @@ public class TikaResourceTest extends CXFTestBase {
String configJson = """
{
"pdf-parser": {
- "ocrStrategy": "OCR_ONLY"
+ "ocr": {
+ "strategy": "OCR_ONLY"
+ }
},
"tesseract-ocr-parser": {
"language": "eng+fra",
@@ -297,7 +299,9 @@ public class TikaResourceTest extends CXFTestBase {
String configJson = """
{
"pdf-parser": {
- "ocrStrategy": "NO_OCR"
+ "ocr": {
+ "strategy": "NO_OCR"
+ }
}
}
""";
@@ -335,7 +339,9 @@ public class TikaResourceTest extends CXFTestBase {
configJson = """
{
"pdf-parser": {
- "ocrStrategy": "OCR_ONLY"
+ "ocr": {
+ "strategy": "OCR_ONLY"
+ }
}
}
""";
@@ -356,7 +362,9 @@ public class TikaResourceTest extends CXFTestBase {
configJson = """
{
"pdf-parser": {
- "ocrStrategy": "non-sense-value"
+ "ocr": {
+ "strategy": "non-sense-value"
+ }
}
}
""";
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index 497dfd812c..6d1c0149cc 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@ -336,7 +336,9 @@ public class UnpackerResourceTest extends CXFTestBase {
String configJson = """
{
"pdf-parser": {
- "ocrStrategy": "OCR_ONLY"
+ "ocr": {
+ "strategy": "OCR_ONLY"
+ }
}
}
""";
@@ -369,7 +371,9 @@ public class UnpackerResourceTest extends CXFTestBase {
{
"pdf-parser": {
"imageStrategy": "RENDER_PAGES_AT_PAGE_END",
- "ocrImageType": "RGB"
+ "ocr": {
+ "imageType": "RGB"
+ }
}
}
""";
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
index 09fd301f61..6f35a72eae 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
@@ -132,8 +132,10 @@ public class UnpackerResourceWithConfigTest extends
CXFTestBase {
{
"pdf-parser": {
"imageStrategy": "RENDER_PAGES_AT_PAGE_END",
- "ocrImageType": "RGB",
- "ocrImageFormat": "TIFF"
+ "ocr": {
+ "imageType": "RGB",
+ "imageFormat": "TIFF"
+ }
}
}
""";
@@ -203,8 +205,10 @@ public class UnpackerResourceWithConfigTest extends
CXFTestBase {
{
"pdf-parser": {
"imageStrategy": "RENDER_PAGES_AT_PAGE_END",
- "ocrImageType": "GRAY",
- "ocrImageFormat": "JPEG"
+ "ocr": {
+ "imageType": "GRAY",
+ "imageFormat": "JPEG"
+ }
}
}
""";