This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4086a46cece50e5673cc7f90b984d6ab0f598aa7 Author: Tim Allison <[email protected]> AuthorDate: Thu Nov 14 15:24:45 2024 -0500 TIKA-4352 -- add an exclusion list in the StandardWriteFilter (#2046) (cherry picked from commit 5a3a7d2bb434de6ef650c950e2d90d005f388f75) --- .../metadata/writefilter/StandardWriteFilter.java | 12 ++++--- .../writefilter/StandardWriteFilterFactory.java | 12 +++++-- .../writefilter/StandardWriteFilterTest.java | 42 ++++++++++++++++++---- .../org/apache/tika/config/TIKA-3695-exclude.xml | 35 ++++++++++++++++++ 4 files changed, 88 insertions(+), 13 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java index f0e9f1fe6..a245e8d2c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java @@ -113,6 +113,7 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { private final Set<String> includeFields; + private final Set<String> excludeFields; private Map<String, Integer> fieldSizes = new HashMap<>(); @@ -125,12 +126,14 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { * @param maxEstimatedSize * @param includeFields if null or empty, all fields are included; otherwise, which fields * to add to the metadata object. + * @param excludeFields these fields will not be included (unless they're in {@link StandardWriteFilter#ALWAYS_SET_FIELDS}) * @param includeEmpty if <code>true</code>, this will set or add an empty value to the * metadata object. */ protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimatedSize, int maxValuesPerField, Set<String> includeFields, + Set<String> excludeFields, boolean includeEmpty) { this.maxKeySize = maxKeySize; @@ -138,6 +141,7 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { this.maxTotalEstimatedSize = maxEstimatedSize; this.maxValuesPerField = maxValuesPerField; this.includeFields = includeFields; + this.excludeFields = excludeFields; this.includeEmpty = includeEmpty; } @@ -176,6 +180,7 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { setAlwaysInclude(field, value, data); return; } + StringSizePair filterKey = filterKey(field, value, data); setFilterKey(filterKey, value, data); } @@ -433,11 +438,10 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { if (ALWAYS_SET_FIELDS.contains(name)) { return true; } - if (includeFields == null || - includeFields.contains(name)) { - return true; + if (excludeFields.contains(name)) { + return false; } - return false; + return includeFields.isEmpty() || includeFields.contains(name); } private static int estimateSize(String s) { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java index b7d60b540..df6d8b42d 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java @@ -33,7 +33,8 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024; public static int DEFAULT_MAX_VALUES_PER_FIELD = 10; - private Set<String> includeFields = null; + private Set<String> includeFields = Collections.EMPTY_SET; + private Set<String> excludeFields = Collections.EMPTY_SET; private int maxKeySize = DEFAULT_MAX_KEY_SIZE; private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE; private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES; @@ -55,7 +56,8 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { } return new StandardWriteFilter(maxKeySize, maxFieldSize, - maxTotalEstimatedBytes, maxValuesPerField, includeFields, includeEmpty); + maxTotalEstimatedBytes, maxValuesPerField, includeFields, + excludeFields, includeEmpty); } public void setIncludeFields(List<String> includeFields) { @@ -64,6 +66,12 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { this.includeFields = Collections.unmodifiableSet(keys); } + public void setExcludeFields(List<String> excludeFields) { + Set<String> keys = ConcurrentHashMap.newKeySet(excludeFields.size()); + keys.addAll(excludeFields); + this.excludeFields = Collections.unmodifiableSet(keys); + } + public void setMaxTotalEstimatedBytes(int maxTotalEstimatedBytes) { this.maxTotalEstimatedBytes = maxTotalEstimatedBytes; } diff --git a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java index 7b7e8710d..7c3369bfd 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java @@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import java.io.ByteArrayInputStream; import java.nio.charset.StandardCharsets; +import java.util.Collections; import java.util.List; import java.util.Set; @@ -116,7 +117,7 @@ public class StandardWriteFilterTest extends TikaTest { @Test public void testKeySizeFilter() throws Exception { Metadata metadata = filter(10, 1000, 10000, 100, - null, true); + Collections.EMPTY_SET, Collections.EMPTY_SET, true); //test that must add keys are not truncated metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser1"); metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser2"); @@ -138,13 +139,13 @@ public class StandardWriteFilterTest extends TikaTest { String k = "dc:creator";//20 bytes //key is > maxTotalBytes, so the value isn't even added Metadata metadata = filter(100, 10000, 10, - 100, null, false); + 100, Collections.EMPTY_SET, Collections.EMPTY_SET, false); metadata.set(k, "ab"); assertEquals(1, metadata.names().length); assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA)); metadata = filter(100, 10000, 50, 100, - null, false); + Collections.EMPTY_SET, Collections.EMPTY_SET, false); for (int i = 0; i < 10; i++) { metadata.set(k, "abcde"); } @@ -178,7 +179,8 @@ public class StandardWriteFilterTest extends TikaTest { @Test public void testMinSizeForAlwaysInclude() throws Exception { //test that mimes don't get truncated - Metadata metadata = filter(100, 10, 10000, 100, null, true); + Metadata metadata = filter(100, 10, 10000, 100, + Collections.EMPTY_SET, Collections.EMPTY_SET, true); String mime = getLongestMime().toString(); metadata.set(Metadata.CONTENT_TYPE, mime); @@ -192,21 +194,47 @@ public class StandardWriteFilterTest extends TikaTest { @Test public void testMaxFieldValues() throws Exception { - Metadata metadata = filter(100, 10000, 10000, 3, null, true); + Metadata metadata = filter(100, 10000, 10000, 3, + Collections.EMPTY_SET, Collections.EMPTY_SET, true); for (int i = 0; i < 10; i++) { metadata.add(TikaCoreProperties.SUBJECT, "ab"); } assertEquals(3, metadata.getValues(TikaCoreProperties.SUBJECT).length); } + @Test + public void testExclude() throws Exception { + TikaConfig tikaConfig = + new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-exclude.xml")); + AutoDetectParser parser = new AutoDetectParser(tikaConfig); + String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + + "<mock>"; + mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>"; + mock += "<metadata action=\"add\" name=\"subject\">01234567890123456789</metadata>"; + mock += "<metadata action=\"add\" name=\"subjectB\">01234567890123456789</metadata>"; + mock += "<write element=\"p\" times=\"1\"> hello </write>\n"; + mock += "</mock>"; + Metadata metadata = new Metadata(); + List<Metadata> metadataList = + getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), + parser, metadata, new ParseContext(), true); + assertEquals(1, metadataList.size()); + metadata = metadataList.get(0); + assertEquals(9, metadata.names().length); + assertEquals("01234567890123456789", metadata.get("dc:creator")); + assertEquals("01234567890123456789", metadata.get("subjectB")); + assertNull(metadata.get("subject")); + } + + private void assertTruncated(Metadata metadata) { assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA)); } private Metadata filter(int maxKeySize, int maxFieldSize, int maxTotalBytes, int maxValuesPerField, - Set<String> includeFields, boolean includeEmpty) { + Set<String> includeFields, Set<String> excludeFields, boolean includeEmpty) { MetadataWriteFilter filter = new StandardWriteFilter(maxKeySize, maxFieldSize, - maxTotalBytes, maxValuesPerField, includeFields, includeEmpty); + maxTotalBytes, maxValuesPerField, includeFields, excludeFields, includeEmpty); Metadata metadata = new Metadata(); metadata.setMetadataWriteFilter(filter); return metadata; diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-exclude.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-exclude.xml new file mode 100644 index 000000000..974b43678 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-exclude.xml @@ -0,0 +1,35 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"/> + </parsers> + <autoDetectParserConfig> + <params> + <spoolToDisk>12345</spoolToDisk> + <outputThreshold>6789</outputThreshold> + </params> + <metadataWriteFilterFactory class="org.apache.tika.metadata.writefilter.StandardWriteFilterFactory"> + <params> + <excludeFields> + <field>subject</field> + </excludeFields> + </params> + </metadataWriteFilterFactory> + </autoDetectParserConfig> +</properties>
