This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4261
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 8e83de8d8e6914db8fe5c93bc7a6a83c79e80f4d
Author: tallison <[email protected]>
AuthorDate: Fri May 24 14:42:42 2024 -0400

    TIKA-4261 -- add a clear by attachment type metadata filter
---
 .../ClearByAttachmentTypeMetadataFilter.java       | 85 ++++++++++++++++++++++
 .../tika/metadata/filter/TestMetadataFilter.java   | 19 +++++
 .../config/TIKA-4261-clear-by-embedded-type.xml    | 27 +++++++
 3 files changed, 131 insertions(+)

diff --git 
a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java
 
b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java
new file mode 100644
index 000000000..1d6825674
--- /dev/null
+++ 
b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * This class clears the entire metadata object if the value of
+ * {@link TikaCoreProperties#EMBEDDED_RESOURCE_TYPE} matches one of the configured
+ * types.  The idea is that you might not want to store/transmit metadata for
+ * images or specific file types.
+ * <p>
+ * Metadata objects that carry no embedded resource type, or a type that has not
+ * been configured, are left untouched.
+ */
+public class ClearByAttachmentTypeMetadataFilter extends MetadataFilter {
+    private final Set<String> types;
+
+    /**
+     * Creates a filter with no configured types; add them via {@link #setTypes(List)}.
+     */
+    public ClearByAttachmentTypeMetadataFilter() {
+        this(new HashSet<>());
+    }
+
+    /**
+     * Creates a filter that clears metadata for the given attachment types.
+     *
+     * @param types attachment types whose metadata should be cleared; the set is
+     *              copied, so it may be immutable and is never modified by this class
+     */
+    public ClearByAttachmentTypeMetadataFilter(Set<String> types) {
+        // Copy defensively: setTypes() adds to this set, which would otherwise fail
+        // on an immutable argument or silently modify the caller's collection.
+        this.types = new HashSet<>(types);
+    }
+
+    /**
+     * Removes every entry from {@code metadata} if its embedded resource type is one
+     * of the configured types; otherwise does nothing.
+     *
+     * @param metadata the metadata object to inspect and possibly clear
+     * @throws TikaException declared by the overridden method; not thrown here
+     */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String type = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+        if (type == null || !types.contains(type)) {
+            return;
+        }
+        for (String n : metadata.names()) {
+            metadata.remove(n);
+        }
+    }
+
+    /**
+     * Adds attachment types to clear.  Every entry is validated before any is added,
+     * so a failed call leaves the configured types unchanged.
+     * <p>
+     * For types see {@link TikaCoreProperties.EmbeddedResourceType}
+     *
+     * @param types attachment types that should be deleted.
+     * @throws TikaConfigException if an entry is {@code null} or is not the exact
+     *                             (case-sensitive) name of an
+     *                             {@link TikaCoreProperties.EmbeddedResourceType}
+     */
+    @Field
+    public void setTypes(List<String> types) throws TikaConfigException {
+        for (String t : types) {
+            if (t == null) {
+                // valueOf(null) would throw a NullPointerException; report it as a
+                // configuration problem like any other unrecognised value
+                throw new TikaConfigException(unrecognisedTypeMessage(t));
+            }
+            try {
+                TikaCoreProperties.EmbeddedResourceType.valueOf(t);
+            } catch (IllegalArgumentException e) {
+                // keep the original exception as the cause
+                throw new TikaConfigException(unrecognisedTypeMessage(t), e);
+            }
+        }
+        this.types.addAll(types);
+    }
+
+    /**
+     * Builds the error message for an unrecognised type, listing all valid names.
+     */
+    private static String unrecognisedTypeMessage(String t) {
+        StringBuilder sb = new StringBuilder();
+        for (TikaCoreProperties.EmbeddedResourceType type :
+                TikaCoreProperties.EmbeddedResourceType.values()) {
+            if (sb.length() > 0) {
+                sb.append(", ");
+            }
+            sb.append(type.name());
+        }
+        return "I'm sorry. I regret I don't recognise " + t +
+                ". I do recognize the following (case-sensitive):" + sb.toString();
+    }
+}
diff --git 
a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
 
b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
index 0b071d0be..91e4bd3be 100644
--- 
a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
+++ 
b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -244,4 +244,23 @@ public class TestMetadataFilter extends 
AbstractTikaConfigTest {
         assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE));
     }
 
+    @Test
+    public void testAttachmentTypeMetadataFilter() throws Exception {
+        TikaConfig config = getConfig("TIKA-4261-clear-by-embedded-type.xml");
+        MetadataFilter filter = config.getMetadataFilter();
+
+        // INLINE is listed in the config, so the filter must empty the metadata
+        Metadata inlineMetadata = new Metadata();
+        inlineMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.INLINE.name());
+        inlineMetadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+        filter.filter(inlineMetadata);
+        assertEquals(0, inlineMetadata.names().length);
+
+        // ALTERNATE_FORMAT_CHUNK is not listed, so both entries must survive
+        Metadata chunkMetadata = new Metadata();
+        chunkMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK.name());
+        chunkMetadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+        filter.filter(chunkMetadata);
+        assertEquals(2, chunkMetadata.names().length);
+    }
+
 }
diff --git 
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4261-clear-by-embedded-type.xml
 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4261-clear-by-embedded-type.xml
new file mode 100644
index 000000000..4f5aa6dbd
--- /dev/null
+++ 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4261-clear-by-embedded-type.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <metadataFilters>
+    <metadataFilter class="org.apache.tika.metadata.filter.ClearByAttachmentTypeMetadataFilter">
+      <types>
+        <type>INLINE</type>
+        <type>ATTACHMENT</type>
+      </types>
+    </metadataFilter>
+  </metadataFilters>
+</properties>

Reply via email to