This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new de0058d52 TIKA-4545 - swap in concrete classes in
AutoDetectParserConfig to help Jackson (#2422)
de0058d52 is described below
commit de0058d525f2ee8dd963487cb5958f136fb63b4f
Author: Tim Allison <[email protected]>
AuthorDate: Fri Dec 5 06:30:16 2025 -0500
TIKA-4545 - swap in concrete classes in AutoDetectParserConfig to help
Jackson (#2422)
* TIKA-4545 - swap in concrete classes to help Jackson
---
.../java/org/apache/tika/config/ConfigBase.java | 21 +++++------
.../tika/extractor/RUnpackExtractorFactory.java | 41 +++++++++-------------
.../writefilter/StandardWriteFilterFactory.java | 27 ++++++--------
.../tika/parser/AutoDetectParserConfigTest.java | 15 ++++++++
...a-config-upcasing-custom-handler-decorator.json | 15 ++------
.../configs/tika-config-write-filter.json | 21 +++++++++++
6 files changed, 78 insertions(+), 62 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java
b/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java
index 9cae53207..f30949b63 100644
--- a/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java
+++ b/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java
@@ -31,7 +31,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
-import java.util.TreeSet;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
@@ -248,14 +247,14 @@ public abstract class ConfigBase {
SetterClassPair setterClassPair = findSetterClassPair(object,
itemName);
boolean processed = false;
if (!hasClass(param)) {
- if (setterClassPair.itemClass.isAssignableFrom(Map.class) &&
isMap(param)) {
+ if (Map.class.isAssignableFrom(setterClassPair.itemClass) &&
isMap(param)) {
tryToSetMap(object, param);
processed = true;
- } else if
(setterClassPair.itemClass.isAssignableFrom(List.class)) {
+ } else if
(List.class.isAssignableFrom(setterClassPair.itemClass)) {
tryToSetList(object, param);
processed = true;
- } else if
(setterClassPair.itemClass.isAssignableFrom(Set.class)) {
- tryToSetSet(object, param);
+ } else if
(Set.class.isAssignableFrom(setterClassPair.itemClass)) {
+ tryToSetSet(object, param, setterClassPair.itemClass);
processed = true;
}
}
@@ -363,14 +362,16 @@ public abstract class ConfigBase {
return false;
}
- private static void tryToSetSet(Object object, Node param) throws
TikaConfigException {
+ private static void tryToSetSet(Object object, Node param, Class clazz)
throws TikaConfigException {
//simple hack for now -- only handle Set<String>
- tryToSetStringSet(object, param);
+ tryToSetStringSet(object, param, clazz);
}
- private static void tryToSetStringSet(Object object, Node param) throws
TikaConfigException {
+ private static void tryToSetStringSet(Object object, Node param, Class
clazz) throws TikaConfigException {
String name = param.getLocalName();
- Set<String> strings = new TreeSet<>();
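+ //NOTE: this only works when the setter's declared parameter type accepts a HashSet (e.g. Set or HashSet); any other concrete Set type will fail at m.invoke.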
+ HashSet<String> strings = new HashSet<>();
+
NodeList nodeList = param.getChildNodes();
for (int i = 0; i < nodeList.getLength(); i++) {
Node n = nodeList.item(i);
@@ -383,7 +384,7 @@ public abstract class ConfigBase {
}
String setter = "set" + name.substring(0, 1).toUpperCase(Locale.US) +
name.substring(1);
try {
- Method m = object.getClass().getMethod(setter, Set.class);
+ Method m = object.getClass().getMethod(setter, clazz);
m.invoke(object, strings);
} catch (NoSuchMethodException | InvocationTargetException |
IllegalAccessException e) {
throw new TikaConfigException("can't set " + name, e);
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
index aed34f535..5813ed3ab 100644
---
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
+++
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.extractor;
-import java.util.Collections;
import java.util.HashSet;
-import java.util.Set;
import org.apache.tika.config.Field;
import org.apache.tika.config.TikaComponent;
@@ -32,10 +30,11 @@ public class RUnpackExtractorFactory implements
EmbeddedDocumentByteStoreExtract
public static long DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION = 10l * 1024l
* 1024l * 1024l;
private boolean writeFileNameToContent = true;
- private Set<String> embeddedBytesIncludeMimeTypes = Collections.EMPTY_SET;
- private Set<String> embeddedBytesExcludeMimeTypes = Collections.EMPTY_SET;
- private Set<String> embeddedBytesIncludeEmbeddedResourceTypes =
Collections.EMPTY_SET;
- private Set<String> embeddedBytesExcludeEmbeddedResourceTypes =
Collections.EMPTY_SET;
+ //concrete HashSet class for the sake of Jackson
+ private HashSet<String> embeddedBytesIncludeMimeTypes = new HashSet<>();
+ private HashSet<String> embeddedBytesExcludeMimeTypes = new HashSet<>();
+ private HashSet<String> embeddedBytesIncludeEmbeddedResourceTypes = new
HashSet<>();
+ private HashSet<String> embeddedBytesExcludeEmbeddedResourceTypes = new
HashSet<>();
private long maxEmbeddedBytesForExtraction =
DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION;
@Field
@@ -44,30 +43,24 @@ public class RUnpackExtractorFactory implements
EmbeddedDocumentByteStoreExtract
}
@Field
- public void setEmbeddedBytesIncludeMimeTypes(Set<String> includeMimeTypes)
{
- embeddedBytesIncludeMimeTypes = new HashSet<>();
- embeddedBytesIncludeMimeTypes.addAll(includeMimeTypes);
+ public void setEmbeddedBytesIncludeMimeTypes(HashSet<String>
includeMimeTypes) {
+ embeddedBytesIncludeMimeTypes = new HashSet<>(includeMimeTypes);
}
@Field
- public void setEmbeddedBytesExcludeMimeTypes(Set<String> excludeMimeTypes)
{
- embeddedBytesExcludeMimeTypes = new HashSet<>();
- embeddedBytesExcludeMimeTypes.addAll(excludeMimeTypes);
+ public void setEmbeddedBytesExcludeMimeTypes(HashSet<String>
excludeMimeTypes) {
+ embeddedBytesExcludeMimeTypes = new HashSet<>(excludeMimeTypes);
}
@Field
- public void setEmbeddedBytesIncludeEmbeddedResourceTypes(Set<String>
includeAttachmentTypes) {
- embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>();
-
embeddedBytesIncludeEmbeddedResourceTypes.addAll(includeAttachmentTypes);
-
+ public void setEmbeddedBytesIncludeEmbeddedResourceTypes(HashSet<String>
includeAttachmentTypes) {
+ embeddedBytesIncludeEmbeddedResourceTypes = new
HashSet<>(includeAttachmentTypes);
}
@Field
- public void setEmbeddedBytesExcludeEmbeddedResourceTypes(Set<String>
excludeAttachmentTypes) {
- embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>();
-
embeddedBytesExcludeEmbeddedResourceTypes.addAll(excludeAttachmentTypes);
-
+ public void setEmbeddedBytesExcludeEmbeddedResourceTypes(HashSet<String>
excludeAttachmentTypes) {
+ embeddedBytesExcludeEmbeddedResourceTypes = new
HashSet<>(excludeAttachmentTypes);
}
/**
@@ -91,19 +84,19 @@ public class RUnpackExtractorFactory implements
EmbeddedDocumentByteStoreExtract
return writeFileNameToContent;
}
- public Set<String> getEmbeddedBytesIncludeMimeTypes() {
+ public HashSet<String> getEmbeddedBytesIncludeMimeTypes() {
return embeddedBytesIncludeMimeTypes;
}
- public Set<String> getEmbeddedBytesExcludeMimeTypes() {
+ public HashSet<String> getEmbeddedBytesExcludeMimeTypes() {
return embeddedBytesExcludeMimeTypes;
}
- public Set<String> getEmbeddedBytesIncludeEmbeddedResourceTypes() {
+ public HashSet<String> getEmbeddedBytesIncludeEmbeddedResourceTypes() {
return embeddedBytesIncludeEmbeddedResourceTypes;
}
- public Set<String> getEmbeddedBytesExcludeEmbeddedResourceTypes() {
+ public HashSet<String> getEmbeddedBytesExcludeEmbeddedResourceTypes() {
return embeddedBytesExcludeEmbeddedResourceTypes;
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
index 1acdf5700..b5d8a0288 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.metadata.writefilter;
-import java.util.Collections;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.HashSet;
/**
* Factory class for {@link StandardWriteFilter}. See that class
@@ -32,15 +30,16 @@ public class StandardWriteFilterFactory implements
MetadataWriteFilterFactory {
public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024;
public static int DEFAULT_MAX_VALUES_PER_FIELD = 10;
- private Set<String> includeFields = Collections.EMPTY_SET;
- private Set<String> excludeFields = Collections.EMPTY_SET;
+ //concrete classes here and in the setters/getters for the sake of Jackson
+ private HashSet<String> includeFields = new HashSet<>();
+ private HashSet<String> excludeFields = new HashSet<>();
private int maxKeySize = DEFAULT_MAX_KEY_SIZE;
private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE;
private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES;
private int maxValuesPerField = DEFAULT_MAX_VALUES_PER_FIELD;
private boolean includeEmpty = false;
- public MetadataWriteFilter newInstance() {
+ public synchronized MetadataWriteFilter newInstance() {
if (maxFieldSize < 0) {
throw new IllegalArgumentException("maxFieldSize must be > 0");
@@ -59,16 +58,12 @@ public class StandardWriteFilterFactory implements
MetadataWriteFilterFactory {
excludeFields, includeEmpty);
}
- public void setIncludeFields(Set<String> includeFields) {
- Set<String> keys = ConcurrentHashMap.newKeySet(includeFields.size());
- keys.addAll(includeFields);
- this.includeFields = Collections.unmodifiableSet(keys);
+ public void setIncludeFields(HashSet<String> includeFields) {
+ this.includeFields = new HashSet<>(includeFields);
}
- public void setExcludeFields(Set<String> excludeFields) {
- Set<String> keys = ConcurrentHashMap.newKeySet(excludeFields.size());
- keys.addAll(excludeFields);
- this.excludeFields = Collections.unmodifiableSet(keys);
+ public void setExcludeFields(HashSet<String> excludeFields) {
+ this.excludeFields = new HashSet<>(excludeFields);
}
public void setMaxTotalEstimatedBytes(int maxTotalEstimatedBytes) {
@@ -91,11 +86,11 @@ public class StandardWriteFilterFactory implements
MetadataWriteFilterFactory {
this.maxValuesPerField = maxValuesPerField;
}
- public Set<String> getIncludeFields() {
+ public HashSet<String> getIncludeFields() {
return includeFields;
}
- public Set<String> getExcludeFields() {
+ public HashSet<String> getExcludeFields() {
return excludeFields;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index b960194ab..895356404 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -77,6 +78,20 @@ public class AutoDetectParserConfigTest extends TikaTest {
assertContainsCount("15.9.2007 11:02", txt, 2);
}
+ @Test
+ public void testWriteFilter() throws Exception {
+ //test to make sure that the metadata write filter configured in
+ //tika-config-write-filter.json limits which metadata keys are kept
+ Parser p =
TikaLoaderHelper.getLoader("tika-config-write-filter.json").loadAutoDetectParser();
+ List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p);
+ for (Metadata metadata : metadataList) {
+ for (String k : metadata.names()) {
+ assertTrue(k.startsWith("X-TIKA:") ||
k.startsWith("access_permission:")
+ || k.equals("Content-Type") || k.equals("dc:creator"));
+ }
+ }
+ }
+
@Test
public void testDigests() throws Exception {
//test to make sure that the decorator is only applied once for
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
index d573606b9..c19e5e3e6 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
@@ -8,12 +8,8 @@
"metadataWriteFilterFactory": {
"@class":
"org.apache.tika.metadata.writefilter.StandardWriteFilterFactory",
"includeFields": [
- "java.util.Collections$EmptySet",
- []
],
"excludeFields": [
- "java.util.Collections$EmptySet",
- []
],
"maxKeySize": 1024,
"maxFieldSize": 102400,
@@ -25,20 +21,15 @@
"@class": "org.apache.tika.extractor.RUnpackExtractorFactory",
"writeFileNameToContent": true,
"embeddedBytesIncludeMimeTypes": [
- "java.util.Collections$EmptySet",
- []
+ "text/pdf"
],
"embeddedBytesExcludeMimeTypes": [
- "java.util.Collections$EmptySet",
- []
+ "rtf/application"
],
"embeddedBytesIncludeEmbeddedResourceTypes": [
- "java.util.Collections$EmptySet",
- []
+ "appended"
],
"embeddedBytesExcludeEmbeddedResourceTypes": [
- "java.util.Collections$EmptySet",
- []
],
"maxEmbeddedBytesForExtraction": 10737418240
},
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
new file mode 100644
index 000000000..d05a6e504
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
@@ -0,0 +1,21 @@
+{
+ "auto-detect-parser": {
+ "spoolToDisk": 1000000,
+ "outputThreshold": 1000000,
+ "digesterFactory": {
+ "@class": "org.apache.tika.parser.digestutils.CommonsDigesterFactory",
+ "markLimit": 100000,
+ "algorithmString": "sha256:32,md5",
+ "skipContainerDocument": true
+ },
+ "metadataWriteFilterFactory": {
+ "@class":
"org.apache.tika.metadata.writefilter.StandardWriteFilterFactory",
+ "includeFields": [
+ "X-TIKA-CONTENT",
+ "dc:creator"
+ ]
+ },
+ "throwOnZeroBytes": false
+ }
+}
+