This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 5a3a7d2bb TIKA-4352 -- add an exclusion list in the
StandardWriteFilter (#2046)
5a3a7d2bb is described below
commit 5a3a7d2bb434de6ef650c950e2d90d005f388f75
Author: Tim Allison <[email protected]>
AuthorDate: Thu Nov 14 15:24:45 2024 -0500
TIKA-4352 -- add an exclusion list in the StandardWriteFilter (#2046)
---
.../metadata/writefilter/StandardWriteFilter.java | 12 ++++---
.../writefilter/StandardWriteFilterFactory.java | 12 +++++--
.../writefilter/StandardWriteFilterTest.java | 42 ++++++++++++++++++----
.../org/apache/tika/config/TIKA-3695-exclude.xml | 35 ++++++++++++++++++
4 files changed, 88 insertions(+), 13 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
index f0e9f1fe6..a245e8d2c 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
@@ -113,6 +113,7 @@ public class StandardWriteFilter implements
MetadataWriteFilter, Serializable {
private final Set<String> includeFields;
+ private final Set<String> excludeFields;
private Map<String, Integer> fieldSizes = new HashMap<>();
@@ -125,12 +126,14 @@ public class StandardWriteFilter implements
MetadataWriteFilter, Serializable {
* @param maxEstimatedSize
* @param includeFields if null or empty, all fields are included;
otherwise, which fields
* to add to the metadata object.
+ * @param excludeFields these fields will not be included (unless they're in {@link StandardWriteFilter#ALWAYS_SET_FIELDS})
* @param includeEmpty if <code>true</code>, this will set or add an empty
value to the
* metadata object.
*/
protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int
maxEstimatedSize,
int maxValuesPerField,
Set<String> includeFields,
+ Set<String> excludeFields,
boolean includeEmpty) {
this.maxKeySize = maxKeySize;
@@ -138,6 +141,7 @@ public class StandardWriteFilter implements
MetadataWriteFilter, Serializable {
this.maxTotalEstimatedSize = maxEstimatedSize;
this.maxValuesPerField = maxValuesPerField;
this.includeFields = includeFields;
+ this.excludeFields = excludeFields;
this.includeEmpty = includeEmpty;
}
@@ -176,6 +180,7 @@ public class StandardWriteFilter implements
MetadataWriteFilter, Serializable {
setAlwaysInclude(field, value, data);
return;
}
+
StringSizePair filterKey = filterKey(field, value, data);
setFilterKey(filterKey, value, data);
}
@@ -433,11 +438,10 @@ public class StandardWriteFilter implements
MetadataWriteFilter, Serializable {
if (ALWAYS_SET_FIELDS.contains(name)) {
return true;
}
- if (includeFields == null ||
- includeFields.contains(name)) {
- return true;
+ if (excludeFields.contains(name)) {
+ return false;
}
- return false;
+ return includeFields.isEmpty() || includeFields.contains(name);
}
private static int estimateSize(String s) {
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
index b7d60b540..df6d8b42d 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
@@ -33,7 +33,8 @@ public class StandardWriteFilterFactory implements
MetadataWriteFilterFactory {
public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024;
public static int DEFAULT_MAX_VALUES_PER_FIELD = 10;
- private Set<String> includeFields = null;
+ private Set<String> includeFields = Collections.EMPTY_SET;
+ private Set<String> excludeFields = Collections.EMPTY_SET;
private int maxKeySize = DEFAULT_MAX_KEY_SIZE;
private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE;
private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES;
@@ -55,7 +56,8 @@ public class StandardWriteFilterFactory implements
MetadataWriteFilterFactory {
}
return new StandardWriteFilter(maxKeySize, maxFieldSize,
- maxTotalEstimatedBytes, maxValuesPerField, includeFields, includeEmpty);
+ maxTotalEstimatedBytes, maxValuesPerField, includeFields,
+ excludeFields, includeEmpty);
}
public void setIncludeFields(List<String> includeFields) {
@@ -64,6 +66,12 @@ public class StandardWriteFilterFactory implements
MetadataWriteFilterFactory {
this.includeFields = Collections.unmodifiableSet(keys);
}
+ public void setExcludeFields(List<String> excludeFields) {
+ Set<String> keys = ConcurrentHashMap.newKeySet(excludeFields.size());
+ keys.addAll(excludeFields);
+ this.excludeFields = Collections.unmodifiableSet(keys);
+ }
+
public void setMaxTotalEstimatedBytes(int maxTotalEstimatedBytes) {
this.maxTotalEstimatedBytes = maxTotalEstimatedBytes;
}
diff --git
a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
index 7b7e8710d..7c3369bfd 100644
---
a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
+++
b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
@@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertNull;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
+import java.util.Collections;
import java.util.List;
import java.util.Set;
@@ -116,7 +117,7 @@ public class StandardWriteFilterTest extends TikaTest {
@Test
public void testKeySizeFilter() throws Exception {
Metadata metadata = filter(10, 1000, 10000, 100,
- null, true);
+ Collections.EMPTY_SET, Collections.EMPTY_SET, true);
//test that must add keys are not truncated
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser1");
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser2");
@@ -138,13 +139,13 @@ public class StandardWriteFilterTest extends TikaTest {
String k = "dc:creator";//20 bytes
//key is > maxTotalBytes, so the value isn't even added
Metadata metadata = filter(100, 10000, 10,
- 100, null, false);
+ 100, Collections.EMPTY_SET, Collections.EMPTY_SET, false);
metadata.set(k, "ab");
assertEquals(1, metadata.names().length);
assertEquals("true",
metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
metadata = filter(100, 10000, 50, 100,
- null, false);
+ Collections.EMPTY_SET, Collections.EMPTY_SET, false);
for (int i = 0; i < 10; i++) {
metadata.set(k, "abcde");
}
@@ -178,7 +179,8 @@ public class StandardWriteFilterTest extends TikaTest {
@Test
public void testMinSizeForAlwaysInclude() throws Exception {
//test that mimes don't get truncated
- Metadata metadata = filter(100, 10, 10000, 100, null, true);
+ Metadata metadata = filter(100, 10, 10000, 100,
+ Collections.EMPTY_SET, Collections.EMPTY_SET, true);
String mime = getLongestMime().toString();
metadata.set(Metadata.CONTENT_TYPE, mime);
@@ -192,21 +194,47 @@ public class StandardWriteFilterTest extends TikaTest {
@Test
public void testMaxFieldValues() throws Exception {
- Metadata metadata = filter(100, 10000, 10000, 3, null, true);
+ Metadata metadata = filter(100, 10000, 10000, 3,
+ Collections.EMPTY_SET, Collections.EMPTY_SET, true);
for (int i = 0; i < 10; i++) {
metadata.add(TikaCoreProperties.SUBJECT, "ab");
}
assertEquals(3, metadata.getValues(TikaCoreProperties.SUBJECT).length);
}
+ @Test
+ public void testExclude() throws Exception {
+ TikaConfig tikaConfig =
+ new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-exclude.xml"));
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
+ "<mock>";
+ mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
+ mock += "<metadata action=\"add\" name=\"subject\">01234567890123456789</metadata>";
+ mock += "<metadata action=\"add\" name=\"subjectB\">01234567890123456789</metadata>";
+ mock += "<write element=\"p\" times=\"1\"> hello </write>\n";
+ mock += "</mock>";
+ Metadata metadata = new Metadata();
+ List<Metadata> metadataList =
+ getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+ parser, metadata, new ParseContext(), true);
+ assertEquals(1, metadataList.size());
+ metadata = metadataList.get(0);
+ assertEquals(9, metadata.names().length);
+ assertEquals("01234567890123456789", metadata.get("dc:creator"));
+ assertEquals("01234567890123456789", metadata.get("subjectB"));
+ assertNull(metadata.get("subject"));
+ }
+
+
private void assertTruncated(Metadata metadata) {
assertEquals("true",
metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
}
private Metadata filter(int maxKeySize, int maxFieldSize, int
maxTotalBytes,
int maxValuesPerField,
- Set<String> includeFields, boolean includeEmpty) {
+ Set<String> includeFields, Set<String> excludeFields, boolean includeEmpty) {
MetadataWriteFilter filter = new StandardWriteFilter(maxKeySize,
maxFieldSize,
- maxTotalBytes, maxValuesPerField, includeFields, includeEmpty);
+ maxTotalBytes, maxValuesPerField, includeFields, excludeFields, includeEmpty);
Metadata metadata = new Metadata();
metadata.setMetadataWriteFilter(filter);
return metadata;
diff --git
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-exclude.xml
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-exclude.xml
new file mode 100644
index 000000000..974b43678
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-exclude.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <autoDetectParserConfig>
+ <params>
+ <spoolToDisk>12345</spoolToDisk>
+ <outputThreshold>6789</outputThreshold>
+ </params>
+ <metadataWriteFilterFactory
class="org.apache.tika.metadata.writefilter.StandardWriteFilterFactory">
+ <params>
+ <excludeFields>
+ <field>subject</field>
+ </excludeFields>
+ </params>
+ </metadataWriteFilterFactory>
+ </autoDetectParserConfig>
+</properties>