This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new f63bebfdea TIKA-4565 -- tweak configurations for include/exclude (#2441)
f63bebfdea is described below
commit f63bebfdea38c152ab1ffdff591938d7ef8c02b3
Author: Tim Allison <[email protected]>
AuthorDate: Thu Dec 11 12:44:13 2025 -0500
TIKA-4565 -- tweak configurations for include/exclude (#2441)
---
.../apache/tika/cli/XmlToJsonConfigConverter.java | 8 ++--
.../tika/cli/XmlToJsonConfigConverterTest.java | 6 +--
.../src/test/resources/configs/tika-config1.json | 18 ++++----
.../src/test/resources/configs/tika-config2.json | 17 ++------
.../src/test/resources/s3/tika-config-s3.json | 14 +++----
.../configs/TIKA-1702-detector-exclude.json | 2 +-
.../configs/TIKA-1708-detector-default.json | 13 +++---
...2273-encoding-detector-outside-static-init.json | 2 +-
...IKA-2273-exclude-encoding-detector-default.json | 2 +-
.../TIKA-2273-no-icu4j-encoding-detector.json | 2 +-
.../configs/test-default-with-exclusions.json | 2 +-
.../test/resources/configs/tika-4424-config.json | 7 +---
.../configs/tika-config-digests-pdf-only.json | 2 +-
.../resources/configs/tika-config-lib-pst.json | 2 +-
.../org/apache/tika/config/TIKA-1558-exclude.json | 10 ++---
.../apache/tika/config/TIKA-1558-excludesub.json | 2 +-
.../apache/tika/parser/ocr/tesseract-config.json | 2 +-
.../apache/tika/config/loader/DetectorLoader.java | 4 +-
.../tika/config/loader/EncodingDetectorLoader.java | 4 +-
.../apache/tika/config/loader/FrameworkConfig.java | 49 ++++++----------------
.../apache/tika/config/loader/ParserLoader.java | 40 ++++--------------
.../apache/tika/config/loader/TikaJsonConfig.java | 4 ++
.../org/apache/tika/config/loader/TikaLoader.java | 26 ++++++------
.../tika/config/loader/FrameworkConfigTest.java | 49 +++++++++++++---------
.../apache/tika/config/loader/TikaLoaderTest.java | 12 +++---
.../resources/configs/example-tika-config.json | 11 ++---
.../resources/configs/test-decoration-config.json | 6 +--
.../test-default-parser-with-exclusions.json | 2 +-
.../test/resources/configs/test-loader-config.json | 5 +--
29 files changed, 129 insertions(+), 194 deletions(-)
diff --git
a/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
b/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
index fc43b553d5..9be0ee12ee 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
@@ -118,7 +118,7 @@ import org.apache.tika.utils.XMLReaderUtils;
* },
* {
* "default-parser": {
- * "exclude": ["pdf-parser"]
+ * "_exclude": ["pdf-parser"]
* }
* }
* ]
@@ -257,9 +257,9 @@ public class XmlToJsonConfigConverter {
for (Map<String, Object> parserEntry : parsersList) {
if (parserEntry.containsKey("default-parser")) {
Map<?, ?> config = (Map<?, ?>) parserEntry.get("default-parser");
- if (config.containsKey("exclude")) {
+ if (config.containsKey("_exclude")) {
@SuppressWarnings("unchecked")
- List<String> excludes = (List<String>) config.get("exclude");
+ List<String> excludes = (List<String>) config.get("_exclude");
excludedParsers.addAll(excludes);
}
}
@@ -364,7 +364,7 @@ public class XmlToJsonConfigConverter {
}
if (excludes != null && !excludes.isEmpty()) {
- config.put("exclude", excludes);
+ config.put("_exclude", excludes);
}
Map<String, Object> result = new LinkedHashMap<>();
diff --git
a/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
b/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
index 98671c37bd..6f73f810d3 100644
---
a/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
+++
b/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
@@ -92,8 +92,8 @@ public class XmlToJsonConfigConverterTest {
System.out.println("Generated JSON:");
System.out.println(json);
- // Verify exclude is at the correct level (not under _decorate)
- assertTrue(json.contains("\"exclude\""), "Should have exclude array");
+ // Verify exclude is at the correct level (with underscore prefix)
+ assertTrue(json.contains("\"_exclude\""), "Should have _exclude array");
assertFalse(json.contains("\"_decorate\""), "_decorate should not be used for parser excludes");
assertTrue(json.contains("\"jsoup-parser\""), "Should exclude jsoup-parser");
assertTrue(json.contains("\"pdf-parser\""), "Should exclude pdf-parser");
@@ -224,7 +224,7 @@ public class XmlToJsonConfigConverterTest {
System.out.println(json);
// Verify the JSON still contains the exclusions (we don't remove them, just inform)
- assertTrue(json.contains("\"exclude\""), "Should still have exclude array");
+ assertTrue(json.contains("\"_exclude\""), "Should still have _exclude array");
assertTrue(json.contains("\"pdf-parser\""), "Should have pdf-parser configured");
assertTrue(json.contains("\"jsoup-parser\""), "Should have jsoup-parser configured");
diff --git a/tika-app/src/test/resources/configs/tika-config1.json
b/tika-app/src/test/resources/configs/tika-config1.json
index e4cdbaf96e..1b5f391a97 100644
--- a/tika-app/src/test/resources/configs/tika-config1.json
+++ b/tika-app/src/test/resources/configs/tika-config1.json
@@ -2,16 +2,14 @@
"parsers": [
{
"jsoup-parser": {
- "_decorate": {
- "mimeInclude": [
- "application/vnd.wap.xhtml+xml",
- "application/x-asp",
- "application/xhtml+xml",
- "text/html",
- "application/xml",
- "text/xml"
- ]
- }
+ "_mime-include": [
+ "application/vnd.wap.xhtml+xml",
+ "application/x-asp",
+ "application/xhtml+xml",
+ "text/html",
+ "application/xml",
+ "text/xml"
+ ]
}
}
]
diff --git a/tika-app/src/test/resources/configs/tika-config2.json
b/tika-app/src/test/resources/configs/tika-config2.json
index 0f3cf8ac41..d25f49d852 100644
--- a/tika-app/src/test/resources/configs/tika-config2.json
+++ b/tika-app/src/test/resources/configs/tika-config2.json
@@ -2,24 +2,13 @@
"parsers": [
{
"default-parser": {
- "_decorate": {
- "mimeExclude": [
- "image/jpeg",
- "application/pdf"
- ],
- "parserExclude": [
- "org.apache.tika.parser.executable.ExecutableParser"
- ]
- }
+ "_exclude": ["executable-parser"],
+ "_mime-exclude": ["image/jpeg", "application/pdf"]
}
},
{
"empty-parser": {
- "_decorate": {
- "mimeInclude": [
- "application/pdf"
- ]
- }
+ "_mime-include": ["application/pdf"]
}
}
]
diff --git
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
index 043da2349f..e16f0a9b6b 100644
---
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
+++
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/tika-config-s3.json
@@ -2,14 +2,12 @@
"parsers": [
{
"default-parser": {
- "_decorate": {
- "parserExclude": [
- "org.apache.tika.parser.ocr.TesseractOCRParser",
- "org.apache.tika.parser.pdf.PDFParser",
- "org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
- "org.apache.tika.parser.microsoft.OfficeParser"
- ]
- }
+ "_exclude": [
+ "tesseract-ocr-parser",
+ "pdf-parser",
+ "ooxml-parser",
+ "office-parser"
+ ]
}
},
{
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1702-detector-exclude.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1702-detector-exclude.json
index fe356421d3..80a611f6b5 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1702-detector-exclude.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1702-detector-exclude.json
@@ -3,7 +3,7 @@
"detectors": [
{
"default-detector": {
- "exclude": [
+ "_exclude": [
"default-zip-container-detector",
"poifs-container-detector"
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json
index 4d76bc86a9..4c49c1e460 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json
@@ -1,14 +1,17 @@
{
+ "parsers": [],
"detectors": [
{
- "default-detector": {
- "exclude": [
+ "default-detector" : {
+ "_exclude": [
"default-zip-container-detector"
]
}
}
],
- "translator": {
- "class": "default-translator"
- }
+ "translator": [
+ {
+ "default-translator": {}
+ }
+ ]
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-encoding-detector-outside-static-init.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-encoding-detector-outside-static-init.json
index 2c05becdc8..c1818466e3 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-encoding-detector-outside-static-init.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-encoding-detector-outside-static-init.json
@@ -10,7 +10,7 @@
"encoding-detectors": [
{
"default-encoding-detector" : {
- "exclude":["icu4j-encoding-detector"]
+ "_exclude":["icu4j-encoding-detector"]
}
}
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json
index 240924a28c..56327103c5 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json
@@ -3,7 +3,7 @@
"encoding-detectors": [
{
"default-encoding-detector": {
- "exclude": [
+ "_exclude": [
"html-encoding-detector"
]
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-no-icu4j-encoding-detector.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-no-icu4j-encoding-detector.json
index 8099326159..b37a45121a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-no-icu4j-encoding-detector.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-no-icu4j-encoding-detector.json
@@ -2,7 +2,7 @@
"encoding-detectors": [
{
"default-encoding-detector":{
- "exclude": [
+ "_exclude": [
"icu4j-encoding-detector"
]
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/test-default-with-exclusions.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/test-default-with-exclusions.json
index c29e0f4208..5233e290fe 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/test-default-with-exclusions.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/test-default-with-exclusions.json
@@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
- "exclude": ["pdf-parser", "jsoup-parser"]
+ "_exclude": ["pdf-parser", "jsoup-parser"]
}
}
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.json
index 0175fe3181..82a03978b2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.json
@@ -1,11 +1,8 @@
{
"detectors": [
{
- "_name": "default-detector",
- "_decorate": {
- "detectorExclude": [
- "org.apache.tika.detect.zip.DefaultZipContainerDetector"
- ]
+ "default-detector": {
+ "_exclude": ["default-zip-container-detector"]
}
}
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
index 9f31bfbc9e..34e5248c7c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
@@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
- "exclude": [
+ "_exclude": [
"pdf-parser"
]
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json
index 1396afc7af..da45f42ee8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json
@@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
- "exclude": [
+ "_exclude": [
"outlook-pst-parser",
"pst-mail-item-parser"
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json
index 10101b8536..d25f49d852 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json
@@ -2,17 +2,13 @@
"parsers": [
{
"default-parser": {
- "exclude": ["executable-parser"],
- "_decorate": {
- "mimeExclude": ["image/jpeg", "application/pdf"]
- }
+ "_exclude": ["executable-parser"],
+ "_mime-exclude": ["image/jpeg", "application/pdf"]
}
},
{
"empty-parser": {
- "_decorate": {
- "mimeInclude": ["application/pdf"]
- }
+ "_mime-include": ["application/pdf"]
}
}
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.json
index aa34ec2fbd..0ec57f490a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.json
@@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
- "exclude": ["xml-parser", "dc-xml-parser", "fiction-book-parser"]
+ "_exclude": ["xml-parser", "dc-xml-parser", "fiction-book-parser"]
}
}
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
index 672584b483..00c67e9ebe 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
@@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
- "exclude": ["tesseract-ocr-parser"]
+ "_exclude": ["tesseract-ocr-parser"]
}
},
{
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
index 79b0840abd..321cf878e5 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
@@ -85,8 +85,8 @@ public class DetectorLoader {
// Parse exclusions from default-detector config
JsonNode configNode = entry.getValue();
- if (configNode != null && configNode.has("exclude")) {
- JsonNode excludeNode = configNode.get("exclude");
+ if (configNode != null && configNode.has("_exclude")) {
+ JsonNode excludeNode = configNode.get("_exclude");
if (excludeNode.isArray()) {
for (JsonNode excludeName : excludeNode) {
if (excludeName.isTextual()) {
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java
index 426140ff80..66fa71adc8 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/EncodingDetectorLoader.java
@@ -85,8 +85,8 @@ public class EncodingDetectorLoader {
// Parse exclusions from default-encoding-detector config
JsonNode configNode = entry.getValue();
- if (configNode != null && configNode.has("exclude")) {
- JsonNode excludeNode = configNode.get("exclude");
+ if (configNode != null && configNode.has("_exclude")) {
+ JsonNode excludeNode = configNode.get("_exclude");
if (excludeNode.isArray()) {
for (JsonNode excludeName : excludeNode) {
if (excludeName.isTextual()) {
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
index 573aa75a47..96a101d34a 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/FrameworkConfig.java
@@ -31,14 +31,16 @@ import org.apache.tika.config.JsonConfig;
* Extracts framework-level configuration from component JSON,
* separating fields prefixed with underscore from component-specific config.
*
- * <p>Framework fields:
+ * <p>Framework fields (underscore prefix):
* <ul>
- * <li>{@code _decorate} - Parser decoration config (mime filtering, fallbacks)</li>
+ * <li>{@code _mime-include} - Only handle these mime types</li>
+ * <li>{@code _mime-exclude} - Don't handle these mime types</li>
* </ul>
*/
public class FrameworkConfig {
- private static final String DECORATE_KEY = "_decorate";
+ private static final String MIME_INCLUDE_KEY = "_mime-include";
+ private static final String MIME_EXCLUDE_KEY = "_mime-exclude";
private final ParserDecoration decoration;
private final JsonConfig componentConfigJson;
@@ -66,11 +68,13 @@ public class FrameworkConfig {
ObjectNode objNode = (ObjectNode) configNode.deepCopy();
- // Extract decoration (parser-specific)
+ // Extract mime filtering config (framework-level, underscore prefix)
+ List<String> mimeInclude = parseStringList(objNode.remove(MIME_INCLUDE_KEY));
+ List<String> mimeExclude = parseStringList(objNode.remove(MIME_EXCLUDE_KEY));
+
ParserDecoration decoration = null;
- if (objNode.has(DECORATE_KEY)) {
- JsonNode decorateNode = objNode.remove(DECORATE_KEY);
- decoration = parseDecoration(decorateNode);
+ if (!mimeInclude.isEmpty() || !mimeExclude.isEmpty()) {
+ decoration = new ParserDecoration(mimeInclude, mimeExclude);
}
// Remaining fields are component-specific config
@@ -80,22 +84,6 @@ public class FrameworkConfig {
return new FrameworkConfig(decoration, componentConfigJson);
}
- private static ParserDecoration parseDecoration(JsonNode decorateNode) {
- if (decorateNode == null || !decorateNode.isObject()) {
- return null;
- }
-
- List<String> mimeInclude = parseStringList(decorateNode.get("mimeInclude"));
- List<String> mimeExclude = parseStringList(decorateNode.get("mimeExclude"));
- List<String> fallbacks = parseStringList(decorateNode.get("fallbacks"));
-
- if (mimeInclude.isEmpty() && mimeExclude.isEmpty() && fallbacks.isEmpty()) {
- return null;
- }
-
- return new ParserDecoration(mimeInclude, mimeExclude, fallbacks);
- }
-
private static List<String> parseStringList(JsonNode node) {
if (node == null) {
return Collections.emptyList();
@@ -124,18 +112,15 @@ public class FrameworkConfig {
}
/**
- * Parser decoration configuration for mime type filtering and fallbacks.
+ * Parser decoration configuration for mime type filtering.
*/
public static class ParserDecoration {
private final List<String> mimeInclude;
private final List<String> mimeExclude;
- private final List<String> fallbacks;
- public ParserDecoration(List<String> mimeInclude, List<String> mimeExclude,
- List<String> fallbacks) {
+ public ParserDecoration(List<String> mimeInclude, List<String> mimeExclude) {
this.mimeInclude = Collections.unmodifiableList(mimeInclude);
this.mimeExclude = Collections.unmodifiableList(mimeExclude);
- this.fallbacks = Collections.unmodifiableList(fallbacks);
}
public List<String> getMimeInclude() {
@@ -146,16 +131,8 @@ public class FrameworkConfig {
return mimeExclude;
}
- public List<String> getFallbacks() {
- return fallbacks;
- }
-
public boolean hasFiltering() {
return !mimeInclude.isEmpty() || !mimeExclude.isEmpty();
}
-
- public boolean hasFallbacks() {
- return !fallbacks.isEmpty();
- }
}
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
index 95f0dc6166..aa19032f93 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
@@ -42,13 +42,11 @@ import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.RenderingParser;
-import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
-import org.apache.tika.parser.multiple.FallbackParser;
import org.apache.tika.renderer.Renderer;
import org.apache.tika.utils.ServiceLoaderUtils;
/**
- * Loader for parsers with support for decoration (mime type filtering, fallbacks).
+ * Loader for parsers with support for decoration (mime type filtering).
*/
public class ParserLoader {
@@ -112,16 +110,16 @@ public class ParserLoader {
// Parse exclusions from default-parser config
JsonNode configNode = entry.getValue();
- // Check for common mistake: using "excludes" instead of "exclude"
- if (configNode != null && configNode.has("excludes")) {
+ // Check for common mistake: using "_excludes" instead of "_exclude"
+ if (configNode != null && configNode.has("_excludes")) {
throw new TikaConfigException(
- "Invalid configuration for default-parser: found 'excludes' but the correct " +
- "field name is 'exclude' (singular). Please change 'excludes' to 'exclude' " +
+ "Invalid configuration for default-parser: found '_excludes' but the correct " +
+ "field name is '_exclude' (singular). Please change '_excludes' to '_exclude' " +
"in your configuration.");
}
- if (configNode != null && configNode.has("exclude")) {
- JsonNode excludeNode = configNode.get("exclude");
+ if (configNode != null && configNode.has("_exclude")) {
+ JsonNode excludeNode = configNode.get("_exclude");
if (excludeNode.isArray()) {
for (JsonNode excludeName : excludeNode) {
if (excludeName.isTextual()) {
@@ -198,11 +196,6 @@ public class ParserLoader {
if (parsed.decoration.hasFiltering()) {
parser = applyMimeFiltering(parser, parsed.decoration);
}
-
- // Apply fallbacks
- if (parsed.decoration.hasFallbacks()) {
- parser = applyFallbacks(parser, parsed.decoration, parsedConfigs);
- }
}
parserList.add(parser);
@@ -342,25 +335,6 @@ public class ParserLoader {
return parser;
}
- private Parser applyFallbacks(Parser parser, FrameworkConfig.ParserDecoration decoration,
- Map<String, ParsedParserConfig> parsedConfigs)
- throws TikaConfigException {
-
- List<String> fallbackNames = decoration.getFallbacks();
- List<Parser> fallbackParsers = new ArrayList<>();
- fallbackParsers.add(parser); // Primary parser first
-
- for (String fallbackName : fallbackNames) {
- ParsedParserConfig fallbackConfig = parsedConfigs.get(fallbackName);
- if (fallbackConfig == null) {
- throw new TikaConfigException("Unknown fallback parser: " + fallbackName);
- }
- fallbackParsers.add(fallbackConfig.parser);
- }
-
- return new FallbackParser(TikaLoader.getMediaTypeRegistry(), MetadataPolicy.KEEP_ALL, fallbackParsers);
- }
-
private List<Parser> loadSpiParsers(Set<Class<?>> excludeClasses) {
List<Parser> result = new ArrayList<>();
ServiceLoader<Parser> serviceLoader = ServiceLoader.load(Parser.class, classLoader);
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
index d90854666d..06da0f3175 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -69,6 +69,9 @@ import org.apache.tika.exception.TikaConfigException;
* {
* // Core Tika components (validated by TikaLoader)
* "parsers": [
+ * { "pdf-parser": { "_mime-include": ["application/pdf"], "ocrStrategy": "AUTO", ... } },
+ * { "html-parser": { ... } },
+ * { "default-parser": { "_exclude": ["some-parser"] } }
* { "pdf-parser": { "_mime-include": ["application/pdf"], "ocrStrategy": "AUTO" } },
* "html-parser", // String shorthand for no-config components
* { "default-parser": { "_exclude": ["ocr-parser"] } }
@@ -86,6 +89,7 @@ import org.apache.tika.exception.TikaConfigException;
* </pre>
*
* <p>All components use array format for explicit ordering.
+ * Parsers support decoration via "_mime-include" and "_mime-exclude" fields.
* Components without configuration can use string shorthand: "component-name"
* instead of { "component-name": {} }.
* Parsers support mime filtering via "_mime-include" and "_mime-exclude" fields.
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index e3a4c63a6d..01aa21e0f6 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -54,21 +54,19 @@ import org.apache.tika.renderer.Renderer;
* <p>JSON configuration format:
* <pre>
* {
- * "parsers": {
- * "pdf-parser": {
- * "_priority": 10,
- * "_decorate": {
- * "mimeInclude": ["application/pdf"],
- * "mimeExclude": ["application/pdf+fdf"],
- * "fallbacks": ["empty-parser"]
- * },
- * "ocrStrategy": "AUTO",
- * "extractInlineImages": true
+ * "parsers": [
+ * {
+ * "pdf-parser": {
+ * "_mime-include": ["application/pdf"],
+ * "_mime-exclude": ["application/pdf+fdf"],
+ * "ocrStrategy": "AUTO",
+ * "extractInlineImages": true
+ * }
* }
- * },
- * "detectors": {
- * "mime-magic-detector": { ... }
- * }
+ * ],
+ * "detectors": [
+ * { "mime-magic-detector": { ... } }
+ * ]
* }
* </pre>
*/
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/FrameworkConfigTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/FrameworkConfigTest.java
index 0fd8ffaa6c..9a8444bbf4 100644
---
a/tika-serialization/src/test/java/org/apache/tika/config/loader/FrameworkConfigTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/FrameworkConfigTest.java
@@ -37,11 +37,8 @@ public class FrameworkConfigTest {
public void testExtractDecoration() throws Exception {
String json = """
{
- "_decorate": {
- "mimeInclude": ["application/pdf"],
- "mimeExclude": ["application/pdf+fdf"],
- "fallbacks": ["backup-parser"]
- },
+ "_mime-include": ["application/pdf"],
+ "_mime-exclude": ["application/pdf+fdf"],
"name": "test"
}
""";
@@ -53,7 +50,6 @@ public class FrameworkConfigTest {
FrameworkConfig.ParserDecoration decoration = config.getDecoration();
assertTrue(decoration.hasFiltering(), "Should have filtering");
- assertTrue(decoration.hasFallbacks(), "Should have fallbacks");
assertEquals(1, decoration.getMimeInclude().size());
assertEquals("application/pdf", decoration.getMimeInclude().get(0));
@@ -61,11 +57,10 @@ public class FrameworkConfigTest {
assertEquals(1, decoration.getMimeExclude().size());
assertEquals("application/pdf+fdf", decoration.getMimeExclude().get(0));
- assertEquals(1, decoration.getFallbacks().size());
- assertEquals("backup-parser", decoration.getFallbacks().get(0));
-
- assertFalse(config.getComponentConfigJson().json().contains("_decorate"),
- "Component config should not contain _decorate");
+ assertFalse(config.getComponentConfigJson().json().contains("_mime-include"),
+ "Component config should not contain _mime-include");
+ assertFalse(config.getComponentConfigJson().json().contains("_mime-exclude"),
+ "Component config should not contain _mime-exclude");
}
@Test
@@ -83,10 +78,10 @@ public class FrameworkConfigTest {
}
@Test
- public void testEmptyDecoration() throws Exception {
+ public void testMimeIncludeOnly() throws Exception {
String json = """
{
- "_decorate": {},
+ "_mime-include": ["text/plain"],
"name": "test"
}
""";
@@ -94,17 +89,33 @@ public class FrameworkConfigTest {
FrameworkConfig config = FrameworkConfig.extract(node, MAPPER);
- // Empty decoration should return null
- assertNull(config.getDecoration(), "Empty decoration should be null");
+ assertNotNull(config.getDecoration(), "Decoration should be present");
+ assertEquals(1, config.getDecoration().getMimeInclude().size());
+ assertTrue(config.getDecoration().getMimeExclude().isEmpty());
+ }
+
+ @Test
+ public void testMimeExcludeOnly() throws Exception {
+ String json = """
+ {
+ "_mime-exclude": ["image/jpeg"],
+ "name": "test"
+ }
+ """;
+ JsonNode node = MAPPER.readTree(json);
+
+ FrameworkConfig config = FrameworkConfig.extract(node, MAPPER);
+
+ assertNotNull(config.getDecoration(), "Decoration should be present");
+ assertTrue(config.getDecoration().getMimeInclude().isEmpty());
+ assertEquals(1, config.getDecoration().getMimeExclude().size());
}
@Test
public void testComponentConfigJsonClean() throws Exception {
String json = """
{
- "_decorate": {
- "mimeInclude": ["text/plain"]
- },
+ "_mime-include": ["text/plain"],
"bufferSize": 1024,
"enabled": true
}
@@ -116,7 +127,7 @@ public class FrameworkConfigTest {
String componentJson = config.getComponentConfigJson().json();
// Verify framework fields are removed
- assertFalse(componentJson.contains("_decorate"), "Should not contain _decorate");
+ assertFalse(componentJson.contains("_mime-include"), "Should not contain _mime-include");
// Verify component fields remain
assertTrue(componentJson.contains("bufferSize"), "Should contain bufferSize");
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
index 435282998a..85e0ed1e18 100644
---
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
@@ -362,12 +362,12 @@ public class TikaLoaderTest {
@Test
public void testExcludesInsteadOfExcludeThrowsException() throws Exception {
- // Create a config with the common mistake: "excludes" instead of "exclude"
+ // Create a config with the common mistake: "_excludes" instead of "_exclude"
String invalidConfig = "{\n" +
" \"parsers\": [\n" +
" {\n" +
" \"default-parser\": {\n" +
- " \"excludes\": [\"pdf-parser\"]\n" +
+ " \"_excludes\": [\"pdf-parser\"]\n" +
" }\n" +
" }\n" +
" ]\n" +
@@ -385,10 +385,10 @@ public class TikaLoaderTest {
throw new AssertionError("Expected TikaConfigException to be thrown");
} catch (org.apache.tika.exception.TikaConfigException e) {
// Expected - verify the error message is helpful
- assertTrue(e.getMessage().contains("excludes"),
- "Error message should mention 'excludes'");
- assertTrue(e.getMessage().contains("exclude"),
- "Error message should mention the correct field 'exclude'");
+ assertTrue(e.getMessage().contains("_excludes"),
+ "Error message should mention '_excludes'");
+ assertTrue(e.getMessage().contains("_exclude"),
+ "Error message should mention the correct field '_exclude'");
assertTrue(e.getMessage().contains("singular"),
"Error message should explain it should be singular");
}
diff --git
a/tika-serialization/src/test/resources/configs/example-tika-config.json
b/tika-serialization/src/test/resources/configs/example-tika-config.json
index e6810d34bd..acf6125587 100644
--- a/tika-serialization/src/test/resources/configs/example-tika-config.json
+++ b/tika-serialization/src/test/resources/configs/example-tika-config.json
@@ -2,20 +2,15 @@
"parsers": [
{
"pdf-parser": {
- "_decorate": {
- "mimeInclude": ["application/pdf"],
- "mimeExclude": ["application/pdf+fdf"],
- "fallbacks": ["empty-parser"]
- },
+ "_mime-include": ["application/pdf"],
+ "_mime-exclude": ["application/pdf+fdf"],
"ocrStrategy": "AUTO",
"extractInlineImages": true
}
},
{
"html-parser": {
- "_decorate": {
- "mimeExclude": ["application/xhtml+xml"]
- },
+ "_mime-exclude": ["application/xhtml+xml"],
"encoding": "UTF-8"
}
},
diff --git
a/tika-serialization/src/test/resources/configs/test-decoration-config.json
b/tika-serialization/src/test/resources/configs/test-decoration-config.json
index 63e5b169e5..9568a8f47d 100644
--- a/tika-serialization/src/test/resources/configs/test-decoration-config.json
+++ b/tika-serialization/src/test/resources/configs/test-decoration-config.json
@@ -2,10 +2,8 @@
"parsers": [
{
"configurable-test-parser": {
- "_decorate": {
- "mimeInclude": ["application/pdf", "text/plain"],
- "mimeExclude": ["application/pdf+fdf"]
- },
+ "_mime-include": ["application/pdf", "text/plain"],
+ "_mime-exclude": ["application/pdf+fdf"],
"name": "filtered-parser",
"bufferSize": 4096
}
diff --git
a/tika-serialization/src/test/resources/configs/test-default-parser-with-exclusions.json
b/tika-serialization/src/test/resources/configs/test-default-parser-with-exclusions.json
index 1d6c1dab9e..e8c90fe201 100644
---
a/tika-serialization/src/test/resources/configs/test-default-parser-with-exclusions.json
+++
b/tika-serialization/src/test/resources/configs/test-default-parser-with-exclusions.json
@@ -9,7 +9,7 @@
},
{
"default-parser": {
- "exclude": ["minimal-test-parser", "fallback-test-parser"]
+ "_exclude": ["minimal-test-parser", "fallback-test-parser"]
}
}
]
diff --git
a/tika-serialization/src/test/resources/configs/test-loader-config.json
b/tika-serialization/src/test/resources/configs/test-loader-config.json
index 1c1db9688b..d270d8f788 100644
--- a/tika-serialization/src/test/resources/configs/test-loader-config.json
+++ b/tika-serialization/src/test/resources/configs/test-loader-config.json
@@ -10,10 +10,7 @@
},
{
"fallback-test-parser": {
- "_decorate": {
- "mimeInclude": ["application/test+fallback"],
- "fallbacks": ["minimal-test-parser"]
- },
+ "_mime-include": ["application/test+fallback"],
"message": "primary parser",
"failOnPurpose": false
}