This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4207 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9ffc4df4a3d059d54e1e1851b8d024b24d2043f9 Author: tallison <talli...@apache.org> AuthorDate: Thu Mar 21 13:48:16 2024 -0400 TIKA-4207 -- allow users to configure include/exclude for attachment types and/or mime types --- .../tika/extractor/BasicEmbeddedBytesSelector.java | 77 ++++++++++++++++++++++ ...ctorFactory.java => EmbeddedBytesSelector.java} | 24 +++---- .../ParsingEmbeddedDocumentExtractor.java | 28 +++++++- .../ParsingEmbeddedDocumentExtractorFactory.java | 56 ++++++++++++++-- .../apache/tika/metadata/TikaCoreProperties.java | 4 ++ .../tika/parser/AutoDetectParserConfigTest.java | 72 ++++++++++++++++++++ .../config/TIKA-4207-embedded-bytes-config.xml | 38 +++++++++++ 7 files changed, 277 insertions(+), 22 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java new file mode 100644 index 000000000..1d5a239db --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.extractor; + +import java.util.Set; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.utils.StringUtils; + +public class BasicEmbeddedBytesSelector implements EmbeddedBytesSelector { + + + + private final Set<String> includeMimes; + private final Set<String> excludeMimes; + private final Set<String> includeEmbeddedResourceTypes; + + private final Set<String> excludeEmbeddedResourceTypes; + + public BasicEmbeddedBytesSelector(Set<String> includeMimes, Set<String> excludeMimes, + Set<String> includeEmbeddedResourceTypes, + Set<String> excludeEmbeddedResourceTypes) { + this.includeMimes = includeMimes; + this.excludeMimes = excludeMimes; + this.includeEmbeddedResourceTypes = includeEmbeddedResourceTypes; + this.excludeEmbeddedResourceTypes = excludeEmbeddedResourceTypes; + } + + public boolean select(Metadata metadata) { + String mime = metadata.get(Metadata.CONTENT_TYPE); + if (mime == null) { + mime = ""; + } else { + //if mime matters at all, make sure to get the mime without parameters + if (includeMimes.size() > 0 || excludeMimes.size() > 0) { + MediaType mt = MediaType.parse(mime); + if (mt != null) { + mime = mt.getType() + "/" + mt.getSubtype(); + } + } + } + if (excludeMimes.contains(mime)) { + return false; + } + if (includeMimes.size() > 0 && ! includeMimes.contains(mime)) { + return false; + } + String embeddedResourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + //if a parser doesn't specify the type, treat it as ATTACHMENT + embeddedResourceType = StringUtils.isBlank(embeddedResourceType) ? "ATTACHMENT" : + embeddedResourceType; + + if (excludeEmbeddedResourceTypes.contains(embeddedResourceType)) { + return false; + } + if (includeEmbeddedResourceTypes.size() > 0 && includeEmbeddedResourceTypes.contains(embeddedResourceType)) { + return true; + } + return false; + } +} diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java similarity index 55% copy from tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java copy to tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java index 9136228c4..2ec7df667 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java @@ -16,25 +16,17 @@ */ package org.apache.tika.extractor; -import org.apache.tika.config.Field; import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; -public class ParsingEmbeddedDocumentExtractorFactory - implements EmbeddedDocumentExtractorFactory { +public interface EmbeddedBytesSelector { - private boolean writeFileNameToContent = true; - - @Field - public void setWriteFileNameToContent(boolean writeFileNameToContent) { - this.writeFileNameToContent = writeFileNameToContent; + class AcceptAll implements EmbeddedBytesSelector { + @Override + public boolean select(Metadata metadata) { + return true; + } } + EmbeddedBytesSelector ACCEPT_ALL = new AcceptAll(); - @Override - public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) { - ParsingEmbeddedDocumentExtractor ex = - new ParsingEmbeddedDocumentExtractor(parseContext); - ex.setWriteFileNameToContent(writeFileNameToContent); - return ex; - } + boolean select(Metadata metadata); } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java index 46672838b..ee15c1e22 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java @@ -26,6 +26,8 @@ import java.nio.file.Files; import java.nio.file.Path; import org.apache.commons.io.input.CloseShieldInputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -43,6 +45,7 @@ import org.apache.tika.parser.ParseRecord; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.utils.ExceptionUtils; /** * Helper class for parsers of package archives or other compound document @@ -52,6 +55,9 @@ import org.apache.tika.sax.EmbeddedContentHandler; */ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor { + private static final Logger LOGGER = + LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class); + private static final File ABSTRACT_PATH = new File(""); private static final Parser DELEGATING_PARSER = new DelegatingParser(); @@ -60,6 +66,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract private final ParseContext context; + private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL; + public ParsingEmbeddedDocumentExtractor(ParseContext context) { this.context = context; } @@ -147,6 +155,14 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract } private void storeEmbeddedBytes(Path p, Metadata metadata) { + if (! embeddedBytesSelector.select(metadata)) { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("skipping embedded bytes {} {}", + metadata.get(Metadata.CONTENT_TYPE), + metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } + return; + } EmbeddedDocumentByteStore embeddedDocumentByteStore = context.get(EmbeddedDocumentByteStore.class); int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID); @@ -154,8 +170,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract try { embeddedDocumentByteStore.add(id, metadata, Files.readAllBytes(p)); } catch (IOException e) { - e.printStackTrace(); - //log, or better, store embdocstore exception + metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION, + ExceptionUtils.getStackTrace(e)); } } @@ -175,4 +191,12 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract public void setWriteFileNameToContent(boolean writeFileNameToContent) { this.writeFileNameToContent = writeFileNameToContent; } + + public void setEmbeddedBytesSelector(EmbeddedBytesSelector embeddedBytesSelector) { + this.embeddedBytesSelector = embeddedBytesSelector; + } + + public EmbeddedBytesSelector getEmbeddedBytesSelector() { + return embeddedBytesSelector; + } } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java index 9136228c4..7632ed49c 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java @@ -16,25 +16,73 @@ */ package org.apache.tika.extractor; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + import org.apache.tika.config.Field; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -public class ParsingEmbeddedDocumentExtractorFactory - implements EmbeddedDocumentExtractorFactory { +public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory { private boolean writeFileNameToContent = true; + private Set<String> embeddedBytesIncludeMimeTypes = Collections.EMPTY_SET; + private Set<String> embeddedBytesExcludeMimeTypes = Collections.EMPTY_SET; + private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = Collections.EMPTY_SET; + private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET; @Field public void setWriteFileNameToContent(boolean writeFileNameToContent) { this.writeFileNameToContent = writeFileNameToContent; } + @Field + public void setEmbeddedBytesIncludeMimeTypes(List<String> includeMimeTypes) { + embeddedBytesIncludeMimeTypes = new HashSet<>(); + embeddedBytesIncludeMimeTypes.addAll(includeMimeTypes); + } + + @Field + public void setEmbeddedBytesExcludeMimeTypes(List<String> excludeMimeTypes) { + embeddedBytesExcludeMimeTypes = new HashSet<>(); + embeddedBytesExcludeMimeTypes.addAll(excludeMimeTypes); + + } + + @Field + public void setEmbeddedBytesIncludeEmbeddedResourceTypes(List<String> includeAttachmentTypes) { + embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>(); + embeddedBytesIncludeEmbeddedResourceTypes.addAll(includeAttachmentTypes); + + } + + @Field + public void setEmbeddedBytesExcludeEmbeddedResourceTypes(List<String> excludeAttachmentTypes) { + embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>(); + embeddedBytesExcludeEmbeddedResourceTypes.addAll(excludeAttachmentTypes); + + } + + @Override public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) { - ParsingEmbeddedDocumentExtractor ex = - new ParsingEmbeddedDocumentExtractor(parseContext); + ParsingEmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(parseContext); ex.setWriteFileNameToContent(writeFileNameToContent); + ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector()); return ex; } + + private EmbeddedBytesSelector createEmbeddedBytesSelector() { + if (embeddedBytesIncludeMimeTypes.size() == 0 && + embeddedBytesExcludeMimeTypes.size() == 0 && + embeddedBytesIncludeEmbeddedResourceTypes.size() == 0 && + embeddedBytesExcludeEmbeddedResourceTypes.size() == 0) { + return EmbeddedBytesSelector.ACCEPT_ALL; + } + return new BasicEmbeddedBytesSelector(embeddedBytesIncludeMimeTypes, + embeddedBytesExcludeMimeTypes, embeddedBytesIncludeEmbeddedResourceTypes, + embeddedBytesExcludeEmbeddedResourceTypes); + } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index 6ff02c1cf..effa4a667 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -98,6 +98,10 @@ public interface TikaCoreProperties { Property EMBEDDED_EXCEPTION = Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_exception"); + //exception handling the raw bytes of an embedded file by an EmbeddedDocumentByteStore + Property EMBEDDED_BYTES_EXCEPTION = + Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_bytes_exception"); + //warning while parsing in an embedded file Property EMBEDDED_WARNING = Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_warning"); diff --git a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java new file mode 100644 index 000000000..a0d5d4896 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.InputStream; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.extractor.EmbeddedBytesSelector; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.utils.StringUtils; + +public class AutoDetectParserConfigTest { + + @Test + public void testEmbeddedBytesSelector() throws Exception { + TikaConfig config; + try (InputStream is = TikaConfig.class.getResourceAsStream( + "TIKA-4207-embedded-bytes-config.xml")) { + config = new TikaConfig(is); + } + AutoDetectParserConfig c = config.getAutoDetectParserConfig(); + ParsingEmbeddedDocumentExtractorFactory f = + (ParsingEmbeddedDocumentExtractorFactory) c.getEmbeddedDocumentExtractorFactory(); + + Metadata metadata = new Metadata(); + ParseContext parseContext = new ParseContext(); + ParsingEmbeddedDocumentExtractor ex = (ParsingEmbeddedDocumentExtractor) f.newInstance(metadata, parseContext); + EmbeddedBytesSelector selector = ex.getEmbeddedBytesSelector(); + assertFalse(selector.select(getMetadata("", ""))); + assertTrue(selector.select(getMetadata("application/pdf", ""))); + assertTrue(selector.select(getMetadata("application/pdf", "ATTACHMENT"))); + assertTrue(selector.select(getMetadata("application/pdf", "INLINE"))); + assertTrue(selector.select(getMetadata("text/plain;charset=UTF-7", "INLINE"))); + + assertFalse(selector.select(getMetadata("application/pdf", "MACRO"))); + assertFalse(selector.select(getMetadata("application/docx", ""))); + + } + + private Metadata getMetadata(String mime, String embeddedResourceType) { + Metadata m = new Metadata(); + if (!StringUtils.isBlank(mime)) { + m.set(Metadata.CONTENT_TYPE, mime); + } + if (!StringUtils.isBlank(embeddedResourceType)) { + m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, embeddedResourceType); + } + return m; + } +} diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml new file mode 100644 index 000000000..d60c6b1ca --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml @@ -0,0 +1,38 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"/> + </parsers> + <autoDetectParserConfig> + <spoolToDisk>123450</spoolToDisk> + <outputThreshold>678900</outputThreshold> + <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory"> + <writeFileNameToContent>false</writeFileNameToContent> + <embeddedBytesIncludeMimeTypes> + <mime>application/pdf</mime> + <mime>application/rtf</mime> + <mime>text/plain</mime> + </embeddedBytesIncludeMimeTypes> + <embeddedBytesIncludeEmbeddedResourceTypes> + <type>ATTACHMENT</type> + <type>INLINE</type> + </embeddedBytesIncludeEmbeddedResourceTypes> + </embeddedDocumentExtractorFactory> + </autoDetectParserConfig> +</properties> \ No newline at end of file