This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4261 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8e83de8d8e6914db8fe5c93bc7a6a83c79e80f4d Author: tallison <[email protected]> AuthorDate: Fri May 24 14:42:42 2024 -0400 TIKA-4261 -- add a clear by attachment type metadata filter --- .../ClearByAttachmentTypeMetadataFilter.java | 85 ++++++++++++++++++++++ .../tika/metadata/filter/TestMetadataFilter.java | 19 +++++ .../config/TIKA-4261-clear-by-embedded-type.xml | 27 +++++++ 3 files changed, 131 insertions(+) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java new file mode 100644 index 000000000..1d6825674 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.metadata.filter; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.tika.config.Field; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; + +/** + * Metadata filter that removes every key from a metadata object when its + * {@link TikaCoreProperties#EMBEDDED_RESOURCE_TYPE} value is one of the configured types; metadata with no such value, or with a value that is not configured, is left unchanged. The idea is that you might not want + * to store/transmit metadata for images or specific file types. + */ +public class ClearByAttachmentTypeMetadataFilter extends MetadataFilter { + private final Set<String> types; + + public ClearByAttachmentTypeMetadataFilter() { + this(new HashSet<>()); + } + + public ClearByAttachmentTypeMetadataFilter(Set<String> types) { + this.types = types; + } + + @Override + public void filter(Metadata metadata) throws TikaException { + String type = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (type == null) { + return; + } + if (! types.contains(type)) { + return; + } + for (String n : metadata.names()) { + metadata.remove(n); + } + } + + /** + * Adds attachment types whose metadata should be cleared. For types see {@link TikaCoreProperties.EmbeddedResourceType}; every value must match a constant name exactly (case-sensitive), and no value is added if any one of them fails that check. + * + * @param types attachment types whose metadata should be deleted; added to the types already configured. + * @throws TikaConfigException if a value is not the name of a {@link TikaCoreProperties.EmbeddedResourceType} constant; the message lists the recognized names. + */ + @Field + public void setTypes(List<String> types) throws TikaConfigException { + for (String t : types) { + try { + TikaCoreProperties.EmbeddedResourceType.valueOf(t); + } catch (IllegalArgumentException e) { + StringBuilder sb = new StringBuilder(); + int i = 0; + for (TikaCoreProperties.EmbeddedResourceType type : TikaCoreProperties.EmbeddedResourceType.values()) { + if (i++ > 0) { + sb.append(", "); + } + sb.append(type.name()); + } + throw new TikaConfigException("I'm sorry. I regret I don't recognise " + t + + ". 
I do recognize the following (case-sensitive):" + sb.toString()); + } + } + this.types.addAll(types); + } +} diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java index 0b071d0be..91e4bd3be 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java @@ -244,4 +244,23 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE)); } + @Test + public void testAttachmentTypeMetadataFilter() throws Exception { + TikaConfig config = getConfig("TIKA-4261-clear-by-embedded-type.xml"); + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.name()); + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); + + MetadataFilter filter = config.getMetadataFilter(); + filter.filter(metadata); + assertEquals(0, metadata.names().length); + + metadata = new Metadata(); + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK + .name()); + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); + filter.filter(metadata); + assertEquals(2, metadata.names().length); + } + } diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4261-clear-by-embedded-type.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4261-clear-by-embedded-type.xml new file mode 100644 index 000000000..4f5aa6dbd --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4261-clear-by-embedded-type.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <metadataFilters> + <metadataFilter class="org.apache.tika.metadata.filter.ClearByAttachmentTypeMetadataFilter"> + <types> + <type>INLINE</type> + <type>ATTACHMENT</type> + </types> + </metadataFilter> + </metadataFilters> +</properties>
