This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4545-fix-adpc-component-serialization in repository https://gitbox.apache.org/repos/asf/tika.git
commit 733dfec6594b00cca4ed6e3c4ba3a8286f49b70f Author: tallison <[email protected]> AuthorDate: Thu Dec 4 15:54:31 2025 -0500 TIKA-4545 - swap in concrete classes to help Jackson --- .../tika/extractor/RUnpackExtractorFactory.java | 41 +++++++++------------- .../writefilter/StandardWriteFilterFactory.java | 27 ++++++-------- .../tika/parser/AutoDetectParserConfigTest.java | 15 ++++++++ ...a-config-upcasing-custom-handler-decorator.json | 15 ++------ .../configs/tika-config-write-filter.json | 21 +++++++++++ 5 files changed, 67 insertions(+), 52 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java index aed34f535..5813ed3ab 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java @@ -16,9 +16,7 @@ */ package org.apache.tika.extractor; -import java.util.Collections; import java.util.HashSet; -import java.util.Set; import org.apache.tika.config.Field; import org.apache.tika.config.TikaComponent; @@ -32,10 +30,11 @@ public class RUnpackExtractorFactory implements EmbeddedDocumentByteStoreExtract public static long DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION = 10l * 1024l * 1024l * 1024l; private boolean writeFileNameToContent = true; - private Set<String> embeddedBytesIncludeMimeTypes = Collections.EMPTY_SET; - private Set<String> embeddedBytesExcludeMimeTypes = Collections.EMPTY_SET; - private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = Collections.EMPTY_SET; - private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET; + //concrete HashSet class for the sake of Jackson + private HashSet<String> embeddedBytesIncludeMimeTypes = new HashSet<>(); + private HashSet<String> embeddedBytesExcludeMimeTypes = new HashSet<>(); + private HashSet<String> 
embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>(); + private HashSet<String> embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>(); private long maxEmbeddedBytesForExtraction = DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION; @Field @@ -44,30 +43,24 @@ public class RUnpackExtractorFactory implements EmbeddedDocumentByteStoreExtract } @Field - public void setEmbeddedBytesIncludeMimeTypes(Set<String> includeMimeTypes) { - embeddedBytesIncludeMimeTypes = new HashSet<>(); - embeddedBytesIncludeMimeTypes.addAll(includeMimeTypes); + public void setEmbeddedBytesIncludeMimeTypes(HashSet<String> includeMimeTypes) { + embeddedBytesIncludeMimeTypes = new HashSet<>(includeMimeTypes); } @Field - public void setEmbeddedBytesExcludeMimeTypes(Set<String> excludeMimeTypes) { - embeddedBytesExcludeMimeTypes = new HashSet<>(); - embeddedBytesExcludeMimeTypes.addAll(excludeMimeTypes); + public void setEmbeddedBytesExcludeMimeTypes(HashSet<String> excludeMimeTypes) { + embeddedBytesExcludeMimeTypes = new HashSet<>(excludeMimeTypes); } @Field - public void setEmbeddedBytesIncludeEmbeddedResourceTypes(Set<String> includeAttachmentTypes) { - embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>(); - embeddedBytesIncludeEmbeddedResourceTypes.addAll(includeAttachmentTypes); - + public void setEmbeddedBytesIncludeEmbeddedResourceTypes(HashSet<String> includeAttachmentTypes) { + embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>(includeAttachmentTypes); } @Field - public void setEmbeddedBytesExcludeEmbeddedResourceTypes(Set<String> excludeAttachmentTypes) { - embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>(); - embeddedBytesExcludeEmbeddedResourceTypes.addAll(excludeAttachmentTypes); - + public void setEmbeddedBytesExcludeEmbeddedResourceTypes(HashSet<String> excludeAttachmentTypes) { + embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>(excludeAttachmentTypes); } /** @@ -91,19 +84,19 @@ public class RUnpackExtractorFactory implements 
EmbeddedDocumentByteStoreExtract return writeFileNameToContent; } - public Set<String> getEmbeddedBytesIncludeMimeTypes() { + public HashSet<String> getEmbeddedBytesIncludeMimeTypes() { return embeddedBytesIncludeMimeTypes; } - public Set<String> getEmbeddedBytesExcludeMimeTypes() { + public HashSet<String> getEmbeddedBytesExcludeMimeTypes() { return embeddedBytesExcludeMimeTypes; } - public Set<String> getEmbeddedBytesIncludeEmbeddedResourceTypes() { + public HashSet<String> getEmbeddedBytesIncludeEmbeddedResourceTypes() { return embeddedBytesIncludeEmbeddedResourceTypes; } - public Set<String> getEmbeddedBytesExcludeEmbeddedResourceTypes() { + public HashSet<String> getEmbeddedBytesExcludeEmbeddedResourceTypes() { return embeddedBytesExcludeEmbeddedResourceTypes; } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java index 1acdf5700..b5d8a0288 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java @@ -16,9 +16,7 @@ */ package org.apache.tika.metadata.writefilter; -import java.util.Collections; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; +import java.util.HashSet; /** * Factory class for {@link StandardWriteFilter}. 
See that class @@ -32,15 +30,16 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024; public static int DEFAULT_MAX_VALUES_PER_FIELD = 10; - private Set<String> includeFields = Collections.EMPTY_SET; - private Set<String> excludeFields = Collections.EMPTY_SET; + //concrete classes here and in the setters/getters for the sake of Jackson + private HashSet<String> includeFields = new HashSet<>(); + private HashSet<String> excludeFields = new HashSet<>(); private int maxKeySize = DEFAULT_MAX_KEY_SIZE; private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE; private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES; private int maxValuesPerField = DEFAULT_MAX_VALUES_PER_FIELD; private boolean includeEmpty = false; - public MetadataWriteFilter newInstance() { + public synchronized MetadataWriteFilter newInstance() { if (maxFieldSize < 0) { throw new IllegalArgumentException("maxFieldSize must be > 0"); @@ -59,16 +58,12 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { excludeFields, includeEmpty); } - public void setIncludeFields(Set<String> includeFields) { - Set<String> keys = ConcurrentHashMap.newKeySet(includeFields.size()); - keys.addAll(includeFields); - this.includeFields = Collections.unmodifiableSet(keys); + public void setIncludeFields(HashSet<String> includeFields) { + this.includeFields = new HashSet<>(includeFields); } - public void setExcludeFields(Set<String> excludeFields) { - Set<String> keys = ConcurrentHashMap.newKeySet(excludeFields.size()); - keys.addAll(excludeFields); - this.excludeFields = Collections.unmodifiableSet(keys); + public void setExcludeFields(HashSet<String> excludeFields) { + this.excludeFields = new HashSet<>(excludeFields); } public void setMaxTotalEstimatedBytes(int maxTotalEstimatedBytes) { @@ -91,11 +86,11 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { 
this.maxValuesPerField = maxValuesPerField; } - public Set<String> getIncludeFields() { + public HashSet<String> getIncludeFields() { return includeFields; } - public Set<String> getExcludeFields() { + public HashSet<String> getExcludeFields() { return excludeFields; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java index b960194ab..895356404 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java @@ -18,6 +18,7 @@ package org.apache.tika.parser; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.file.Files; import java.nio.file.Path; @@ -77,6 +78,20 @@ public class AutoDetectParserConfigTest extends TikaTest { assertContainsCount("15.9.2007 11:02", txt, 2); } + @Test + public void testWriteFilter() throws Exception { + //test that the configured StandardWriteFilter restricts metadata + //keys to
the includeFields (plus fields that Tika always writes) + Parser p = TikaLoaderHelper.getLoader("tika-config-write-filter.json").loadAutoDetectParser(); + List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p); + for (Metadata metadata : metadataList) { + for (String k : metadata.names()) { + assertTrue(k.startsWith("X-TIKA:") || k.startsWith("access_permission:") + || k.equals("Content-Type") || k.equals("dc:creator")); + } + } + } + @Test public void testDigests() throws Exception { //test to make sure that the decorator is only applied once for diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json index d573606b9..c19e5e3e6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json @@ -8,12 +8,8 @@ "metadataWriteFilterFactory": { "@class": "org.apache.tika.metadata.writefilter.StandardWriteFilterFactory", "includeFields": [ - "java.util.Collections$EmptySet", - [] ], "excludeFields": [ - "java.util.Collections$EmptySet", - [] ], "maxKeySize": 1024, "maxFieldSize": 102400, @@ -25,20 +21,15 @@ "@class": "org.apache.tika.extractor.RUnpackExtractorFactory", "writeFileNameToContent": true, "embeddedBytesIncludeMimeTypes": [ - "java.util.Collections$EmptySet", - [] + "text/pdf" ], "embeddedBytesExcludeMimeTypes": [ - "java.util.Collections$EmptySet", - [] + "rtf/application" ], "embeddedBytesIncludeEmbeddedResourceTypes": [ - "java.util.Collections$EmptySet", - [] + "appended" ], "embeddedBytesExcludeEmbeddedResourceTypes": [ - "java.util.Collections$EmptySet", - [] ], 
"maxEmbeddedBytesForExtraction": 10737418240 }, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json new file mode 100644 index 000000000..d05a6e504 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json @@ -0,0 +1,21 @@ +{ + "auto-detect-parser": { + "spoolToDisk": 1000000, + "outputThreshold": 1000000, + "digesterFactory": { + "@class": "org.apache.tika.parser.digestutils.CommonsDigesterFactory", + "markLimit": 100000, + "algorithmString": "sha256:32,md5", + "skipContainerDocument": true + }, + "metadataWriteFilterFactory": { + "@class": "org.apache.tika.metadata.writefilter.StandardWriteFilterFactory", + "includeFields": [ + "X-TIKA-CONTENT", + "dc:creator" + ] + }, + "throwOnZeroBytes": false + } +} +
