This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new c1f07222b TIKA-4261 -- add a clear by attachment type metadata filter (#1777)
c1f07222b is described below
commit c1f07222b147ce778eae5d9fef349a84939965e5
Author: Tim Allison <[email protected]>
AuthorDate: Fri May 24 15:16:43 2024 -0400
TIKA-4261 -- add a clear by attachment type metadata filter (#1777)
---
.../ClearByAttachmentTypeMetadataFilter.java | 85 ++++++++++++++++++++++
.../tika/metadata/filter/TestMetadataFilter.java | 19 +++++
.../config/TIKA-4261-clear-by-embedded-type.xml | 27 +++++++
3 files changed, 131 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java
new file mode 100644
index 000000000..1d6825674
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * This class clears the entire metadata object if the
+ * attachment type matches one of the types. The idea is that you might not want
+ * to store/transmit metadata for images or specific file types.
+ */
+public class ClearByAttachmentTypeMetadataFilter extends MetadataFilter {
+    /** Attachment type names whose metadata gets cleared; owned by this instance. */
+    private final Set<String> types;
+
+    /** Creates a filter with no types; types are added via {@link #setTypes(List)}. */
+    public ClearByAttachmentTypeMetadataFilter() {
+        this(new HashSet<>());
+    }
+
+    /**
+     * @param types initial type names; copied, so the caller's set is never modified
+     */
+    public ClearByAttachmentTypeMetadataFilter(Set<String> types) {
+        this.types = new HashSet<>(types);
+    }
+
+    /** Removes every field from {@code metadata} if its embedded resource type is listed. */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String type = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+        if (type == null || !types.contains(type)) {
+            return;
+        }
+        for (String n : metadata.names()) {
+            metadata.remove(n);
+        }
+    }
+
+    /**
+     * For types see {@link TikaCoreProperties.EmbeddedResourceType}
+     *
+     * @param types attachment types that should be deleted.
+     * @throws TikaConfigException if a value is not an EmbeddedResourceType constant name
+     */
+    @Field
+    public void setTypes(List<String> types) throws TikaConfigException {
+        for (String t : types) {
+            try {
+                TikaCoreProperties.EmbeddedResourceType.valueOf(t);
+            } catch (IllegalArgumentException e) {
+                StringBuilder sb = new StringBuilder();
+                for (TikaCoreProperties.EmbeddedResourceType type :
+                        TikaCoreProperties.EmbeddedResourceType.values()) {
+                    sb.append(sb.length() > 0 ? ", " : "").append(type.name());
+                }
+                throw new TikaConfigException("I'm sorry. I regret I don't recognise " + t +
+                        ". I do recognize the following (case-sensitive):" + sb);
+            }
+        }
+        this.types.addAll(types);
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
index 0b071d0be..91e4bd3be 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -244,4 +244,23 @@ public class TestMetadataFilter extends AbstractTikaConfigTest {
assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE));
}
+    /** A listed type (INLINE) must be emptied; a type that is not listed must be left alone. */
+    @Test
+    public void testAttachmentTypeMetadataFilter() throws Exception {
+        MetadataFilter filter = getConfig("TIKA-4261-clear-by-embedded-type.xml").getMetadataFilter();
+        Metadata inline = new Metadata();
+        inline.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.INLINE.name());
+        inline.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+        filter.filter(inline);
+        assertEquals(0, inline.names().length);
+
+        Metadata unlisted = new Metadata();
+        unlisted.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK.name());
+        unlisted.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+        filter.filter(unlisted);
+        assertEquals(2, unlisted.names().length);
+    }
+
}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4261-clear-by-embedded-type.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4261-clear-by-embedded-type.xml
new file mode 100644
index 000000000..4f5aa6dbd
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4261-clear-by-embedded-type.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.ClearByAttachmentTypeMetadataFilter">
+ <types>
+ <type>INLINE</type>
+ <type>ATTACHMENT</type>
+ </types>
+ </metadataFilter>
+ </metadataFilters>
+</properties>