This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4133 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 79e8abb02fd68f17f5f3ad5ba2e0fccc4f974507 Author: tballison <talli...@apache.org> AuthorDate: Fri Sep 15 12:32:04 2023 -0400 TIKA-4133 -- add a capture group metadata filter --- .../filter/CaptureGroupMetadataFilter.java | 110 +++++++++++++++++++++ .../tika/metadata/filter/TestMetadataFilter.java | 54 ++++++++++ .../config/TIKA-4133-capture-group-overwrite.xml | 26 +++++ .../apache/tika/config/TIKA-4133-capture-group.xml | 26 +++++ 4 files changed, 216 insertions(+) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java new file mode 100644 index 000000000..ca9b1e6ea --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.metadata.filter; + +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.utils.StringUtils; + + +/** + * This filter runs a regex against the first value in the "sourceField". + * If the pattern matches, it extracts the first group of the first match and + * sets the "targetField"'s value to that first group. + * <p/> + * If there is a match, this will overwrite whatever value is in the + * "targetField". + * <p/> + * If there is not a match, this filter will be a no-op. + * <p/> + * If there are multiple matches, this filter will capture only the first. + * Open a ticket if you need different behavior. + * <p/> + * If the source field has multiple values, this will run the regex + * against only the first value. + * <p/> + * If the source field does not exist, this filter will be a no-op. + * <p/> + * If the target field is the same value as the source field, this filter + * will overwrite the value in that field. Again, if there are multiple + * values in that field, those will all be overwritten. 
+ */ +public class CaptureGroupMetadataFilter extends MetadataFilter implements Initializable { + + private String regexString; + private Pattern regex; + private String sourceField; + private String targetField; + + @Override + public void filter(Metadata metadata) throws TikaException { + String val = metadata.get(sourceField); + if (StringUtils.isBlank(val)) { + return; + } + Matcher m = regex.matcher(val); + if (m.find()) { + metadata.set(targetField, m.group(1)); + } + } + + @Field + public void setRegex(String regex) { + this.regexString = regex; + } + + @Field + public void setSourceField(String sourceField) { + this.sourceField = sourceField; + } + + @Field + public void setTargetField(String targetField) { + this.targetField = targetField; + } + + @Override + public void initialize(Map<String, Param> params) throws TikaConfigException { + try { + regex = Pattern.compile(regexString); + } catch (PatternSyntaxException e) { + throw new TikaConfigException("Couldn't parse regex", e); + } + + } + + @Override + public void checkInitialization(InitializableProblemHandler problemHandler) + throws TikaConfigException { + if (StringUtils.isBlank(sourceField)) { + throw new TikaConfigException("Must specify a 'sourceField'"); + } + if (StringUtils.isBlank(targetField)) { + throw new TikaConfigException("Must specify a 'targetField'"); + } + } +} diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java index 88d510d57..e3f01f4f1 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java @@ -30,6 +30,7 @@ import org.junit.jupiter.api.Test; import org.apache.tika.config.AbstractTikaConfigTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TIFF; import 
org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -191,4 +192,57 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { filter.filter(m); assertEquals("2021-07-23T08:02:24Z", m.get(TikaCoreProperties.CREATED)); } + + @Test + public void testCaptureGroupBasic() throws Exception { + TikaConfig config = getConfig("TIKA-4133-capture-group.xml"); + + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.TIKA_CONTENT, "quick brown fox"); + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); + + MetadataFilter filter = config.getMetadataFilter(); + filter.filter(metadata); + assertEquals("quick brown fox", metadata.get(TikaCoreProperties.TIKA_CONTENT)); + assertEquals("text/html", metadata.get("mime")); + } + + @Test + public void testCaptureGroupNoSemiColon() throws Exception { + TikaConfig config = getConfig("TIKA-4133-capture-group.xml"); + + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.TIKA_CONTENT, "quick brown fox"); + metadata.set(Metadata.CONTENT_TYPE, "text/html"); + + MetadataFilter filter = config.getMetadataFilter(); + filter.filter(metadata); + assertEquals("quick brown fox", metadata.get(TikaCoreProperties.TIKA_CONTENT)); + assertEquals("text/html", metadata.get("mime")); + } + + @Test + public void testCaptureGroupOverwrite() throws Exception { + TikaConfig config = getConfig("TIKA-4133-capture-group-overwrite.xml"); + + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.TIKA_CONTENT, "quick brown fox"); + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); + + MetadataFilter filter = config.getMetadataFilter(); + filter.filter(metadata); + assertEquals("quick brown fox", metadata.get(TikaCoreProperties.TIKA_CONTENT)); + assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE)); + + // now test that a single match overwrites all the values + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); + 
metadata.add(TikaCoreProperties.TIKA_CONTENT.toString(), "text/html; charset=UTF-8"); + metadata.add(TikaCoreProperties.TIKA_CONTENT.toString(), "text/plain; charset=UTF-8"); + metadata.add(TikaCoreProperties.TIKA_CONTENT.toString(), "application/pdf; charset=UTF-8"); + + filter.filter(metadata); + assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length); + assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE)); + } + } diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group-overwrite.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group-overwrite.xml new file mode 100644 index 000000000..b43655840 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group-overwrite.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<properties> + <metadataFilters> + <metadataFilter class="org.apache.tika.metadata.filter.CaptureGroupMetadataFilter"> + <sourceField>Content-Type</sourceField> + <targetField>Content-Type</targetField> + <regex>\A([^;]+)</regex> + </metadataFilter> + </metadataFilters> +</properties> diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group.xml new file mode 100644 index 000000000..7ad7378e0 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <metadataFilters> + <metadataFilter class="org.apache.tika.metadata.filter.CaptureGroupMetadataFilter"> + <sourceField>Content-Type</sourceField> + <targetField>mime</targetField> + <regex>\A([^;]+)</regex> + </metadataFilter> + </metadataFilters> +</properties>