This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9ffc4df4a3d059d54e1e1851b8d024b24d2043f9
Author: tallison <talli...@apache.org>
AuthorDate: Thu Mar 21 13:48:16 2024 -0400

    TIKA-4207 -- allow users to configure include/exclude for attachment types 
and/or mime types
---
 .../tika/extractor/BasicEmbeddedBytesSelector.java | 77 ++++++++++++++++++++++
 ...ctorFactory.java => EmbeddedBytesSelector.java} | 24 +++----
 .../ParsingEmbeddedDocumentExtractor.java          | 28 +++++++-
 .../ParsingEmbeddedDocumentExtractorFactory.java   | 56 ++++++++++++++--
 .../apache/tika/metadata/TikaCoreProperties.java   |  4 ++
 .../tika/parser/AutoDetectParserConfigTest.java    | 72 ++++++++++++++++++++
 .../config/TIKA-4207-embedded-bytes-config.xml     | 38 +++++++++++
 7 files changed, 277 insertions(+), 22 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
new file mode 100644
index 000000000..1d5a239db
--- /dev/null
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import java.util.Set;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Selects embedded bytes by mime type and embedded resource type. The four sets must
+ * be non-null. Excludes win over includes; an empty include set means "no restriction".
+ */
+public class BasicEmbeddedBytesSelector implements EmbeddedBytesSelector {
+
+    private final Set<String> includeMimes;
+    private final Set<String> excludeMimes;
+    private final Set<String> includeEmbeddedResourceTypes;
+    private final Set<String> excludeEmbeddedResourceTypes;
+
+    public BasicEmbeddedBytesSelector(Set<String> includeMimes, Set<String> excludeMimes,
+                                      Set<String> includeEmbeddedResourceTypes,
+                                      Set<String> excludeEmbeddedResourceTypes) {
+        this.includeMimes = includeMimes;
+        this.excludeMimes = excludeMimes;
+        this.includeEmbeddedResourceTypes = includeEmbeddedResourceTypes;
+        this.excludeEmbeddedResourceTypes = excludeEmbeddedResourceTypes;
+    }
+
+    /**
+     * Returns true if the embedded file described by metadata passes all four filters.
+     */
+    @Override
+    public boolean select(Metadata metadata) {
+        String mime = metadata.get(Metadata.CONTENT_TYPE);
+        if (mime == null) {
+            mime = "";
+        } else if (!includeMimes.isEmpty() || !excludeMimes.isEmpty()) {
+            //mime matters: compare on the base type, without parameters
+            MediaType mt = MediaType.parse(mime);
+            if (mt != null) {
+                mime = mt.getType() + "/" + mt.getSubtype();
+            }
+        }
+        if (excludeMimes.contains(mime)) {
+            return false;
+        }
+        if (!includeMimes.isEmpty() && !includeMimes.contains(mime)) {
+            return false;
+        }
+        String type = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+        //if a parser doesn't specify the type, treat it as ATTACHMENT
+        type = StringUtils.isBlank(type) ? "ATTACHMENT" : type;
+        if (excludeEmbeddedResourceTypes.contains(type)) {
+            return false;
+        }
+        //an empty include set must not reject everything; it means "no restriction"
+        return includeEmbeddedResourceTypes.isEmpty() ||
+                includeEmbeddedResourceTypes.contains(type);
+    }
+}
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
 b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
similarity index 55%
copy from 
tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
copy to 
tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
index 9136228c4..2ec7df667 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
@@ -16,25 +16,17 @@
  */
 package org.apache.tika.extractor;
 
-import org.apache.tika.config.Field;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
 
-public class ParsingEmbeddedDocumentExtractorFactory
-        implements EmbeddedDocumentExtractorFactory {
+public interface EmbeddedBytesSelector {
 
-    private boolean writeFileNameToContent = true;
-
-    @Field
-    public void setWriteFileNameToContent(boolean writeFileNameToContent) {
-        this.writeFileNameToContent = writeFileNameToContent;
+    class AcceptAll implements EmbeddedBytesSelector {
+        @Override
+        public boolean select(Metadata metadata) {
+            return true;
+        }
     }
+    EmbeddedBytesSelector ACCEPT_ALL = new AcceptAll();
 
-    @Override
-    public EmbeddedDocumentExtractor newInstance(Metadata metadata, 
ParseContext parseContext) {
-        ParsingEmbeddedDocumentExtractor ex =
-                new ParsingEmbeddedDocumentExtractor(parseContext);
-        ex.setWriteFileNameToContent(writeFileNameToContent);
-        return ex;
-    }
+    boolean select(Metadata metadata);
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index 46672838b..ee15c1e22 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -26,6 +26,8 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
@@ -43,6 +45,7 @@ import org.apache.tika.parser.ParseRecord;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
 
 /**
  * Helper class for parsers of package archives or other compound document
@@ -52,6 +55,9 @@ import org.apache.tika.sax.EmbeddedContentHandler;
  */
 public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtractor {
 
+    private static final Logger LOGGER =
+            LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class);
+
     private static final File ABSTRACT_PATH = new File("");
 
     private static final Parser DELEGATING_PARSER = new DelegatingParser();
@@ -60,6 +66,8 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
 
     private final ParseContext context;
 
+    private EmbeddedBytesSelector embeddedBytesSelector = 
EmbeddedBytesSelector.ACCEPT_ALL;
+
     public ParsingEmbeddedDocumentExtractor(ParseContext context) {
         this.context = context;
     }
@@ -147,6 +155,14 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
     }
 
     private void storeEmbeddedBytes(Path p, Metadata metadata) {
+        if (! embeddedBytesSelector.select(metadata)) {
+            if (LOGGER.isDebugEnabled()) {
+                LOGGER.debug("skipping embedded bytes {} {}",
+                        metadata.get(Metadata.CONTENT_TYPE),
+                        
metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+            }
+            return;
+        }
         EmbeddedDocumentByteStore embeddedDocumentByteStore =
                 context.get(EmbeddedDocumentByteStore.class);
         int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
@@ -154,8 +170,8 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
         try {
             embeddedDocumentByteStore.add(id, metadata, Files.readAllBytes(p));
         } catch (IOException e) {
-            e.printStackTrace();
-            //log, or better, store embdocstore exception
+            metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
+                    ExceptionUtils.getStackTrace(e));
         }
     }
 
@@ -175,4 +191,12 @@ public class ParsingEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtract
     public void setWriteFileNameToContent(boolean writeFileNameToContent) {
         this.writeFileNameToContent = writeFileNameToContent;
     }
+
+    /**
+     * Sets the selector consulted before the bytes of an embedded file are stored.
+     *
+     * @param embeddedBytesSelector selector to use in place of the current one
+     */
+    public void setEmbeddedBytesSelector(EmbeddedBytesSelector 
embeddedBytesSelector) {
+        this.embeddedBytesSelector = embeddedBytesSelector;
+    }
+
+    /**
+     * @return the selector currently used to decide which embedded bytes are stored
+     */
+    public EmbeddedBytesSelector getEmbeddedBytesSelector() {
+        return embeddedBytesSelector;
+    }
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
index 9136228c4..7632ed49c 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
@@ -16,25 +16,73 @@
  */
 package org.apache.tika.extractor;
 
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
 import org.apache.tika.config.Field;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 
-public class ParsingEmbeddedDocumentExtractorFactory
-        implements EmbeddedDocumentExtractorFactory {
+public class ParsingEmbeddedDocumentExtractorFactory implements 
EmbeddedDocumentExtractorFactory {
 
     private boolean writeFileNameToContent = true;
+    //embedded-bytes filters; while all four are empty, ACCEPT_ALL is used as the selector
+    private Set<String> embeddedBytesIncludeMimeTypes = Collections.emptySet();
+    private Set<String> embeddedBytesExcludeMimeTypes = Collections.emptySet();
+    private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = Collections.emptySet();
+    private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.emptySet();
 
     @Field
     public void setWriteFileNameToContent(boolean writeFileNameToContent) {
         this.writeFileNameToContent = writeFileNameToContent;
     }
 
+    /**
+     * Replaces the mime types whose embedded bytes are included.
+     *
+     * @param includeMimeTypes mime types copied into a new set
+     */
+    @Field
+    public void setEmbeddedBytesIncludeMimeTypes(List<String> includeMimeTypes) {
+        final Set<String> fresh = new HashSet<>();
+        embeddedBytesIncludeMimeTypes = fresh;
+        fresh.addAll(includeMimeTypes);
+    }
+
+    /**
+     * Replaces the mime types whose embedded bytes are excluded.
+     *
+     * @param excludeMimeTypes mime types copied into a new set
+     */
+    @Field
+    public void setEmbeddedBytesExcludeMimeTypes(List<String> excludeMimeTypes) {
+        final Set<String> fresh = new HashSet<>();
+        embeddedBytesExcludeMimeTypes = fresh;
+        fresh.addAll(excludeMimeTypes);
+    }
+
+    /**
+     * Replaces the embedded resource types whose bytes are included.
+     *
+     * @param includeAttachmentTypes embedded resource types copied into a new set
+     */
+    @Field
+    public void setEmbeddedBytesIncludeEmbeddedResourceTypes(List<String> includeAttachmentTypes) {
+        final Set<String> fresh = new HashSet<>();
+        embeddedBytesIncludeEmbeddedResourceTypes = fresh;
+        fresh.addAll(includeAttachmentTypes);
+    }
+
+    /**
+     * Replaces the embedded resource types whose bytes are excluded.
+     *
+     * @param excludeAttachmentTypes embedded resource types copied into a new set
+     */
+    @Field
+    public void setEmbeddedBytesExcludeEmbeddedResourceTypes(List<String> excludeAttachmentTypes) {
+        final Set<String> fresh = new HashSet<>();
+        embeddedBytesExcludeEmbeddedResourceTypes = fresh;
+        fresh.addAll(excludeAttachmentTypes);
+    }
+
+
     @Override
     public EmbeddedDocumentExtractor newInstance(Metadata metadata, 
ParseContext parseContext) {
-        ParsingEmbeddedDocumentExtractor ex =
-                new ParsingEmbeddedDocumentExtractor(parseContext);
+        ParsingEmbeddedDocumentExtractor ex = new 
ParsingEmbeddedDocumentExtractor(parseContext);
         ex.setWriteFileNameToContent(writeFileNameToContent);
+        ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector());
         return ex;
     }
+
+    /**
+     * Builds the selector handed to each new extractor: ACCEPT_ALL when none of the
+     * include/exclude sets has any entries, otherwise a BasicEmbeddedBytesSelector
+     * over the four configured sets.
+     */
+    private EmbeddedBytesSelector createEmbeddedBytesSelector() {
+        boolean anyFilterConfigured = !embeddedBytesIncludeMimeTypes.isEmpty() ||
+                !embeddedBytesExcludeMimeTypes.isEmpty() ||
+                !embeddedBytesIncludeEmbeddedResourceTypes.isEmpty() ||
+                !embeddedBytesExcludeEmbeddedResourceTypes.isEmpty();
+        if (!anyFilterConfigured) {
+            return EmbeddedBytesSelector.ACCEPT_ALL;
+        }
+        return new BasicEmbeddedBytesSelector(embeddedBytesIncludeMimeTypes,
+                embeddedBytesExcludeMimeTypes, embeddedBytesIncludeEmbeddedResourceTypes,
+                embeddedBytesExcludeEmbeddedResourceTypes);
+    }
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java 
b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 6ff02c1cf..effa4a667 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -98,6 +98,10 @@ public interface TikaCoreProperties {
     Property EMBEDDED_EXCEPTION =
             Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + 
"embedded_exception");
 
+    //exception handling the raw bytes of an embedded file by an 
EmbeddedDocumentByteStore
+    Property EMBEDDED_BYTES_EXCEPTION =
+            Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + 
"embedded_bytes_exception");
+
     //warning while parsing in an embedded file
     Property EMBEDDED_WARNING =
             Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + 
"embedded_warning");
diff --git 
a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
 
b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
new file mode 100644
index 000000000..a0d5d4896
--- /dev/null
+++ 
b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.InputStream;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.extractor.EmbeddedBytesSelector;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Checks that the embedded-bytes include settings of a tika-config reach the
+ * selector of the extractor created by the configured factory.
+ */
+public class AutoDetectParserConfigTest {
+
+    @Test
+    public void testEmbeddedBytesSelector() throws Exception {
+        EmbeddedBytesSelector selector = loadSelector();
+
+        //no mime at all: not in the include list
+        assertFalse(selector.select(getMetadata("", "")));
+        //included mime; a missing resource type is treated as ATTACHMENT
+        assertTrue(selector.select(getMetadata("application/pdf", "")));
+        assertTrue(selector.select(getMetadata("application/pdf", "ATTACHMENT")));
+        assertTrue(selector.select(getMetadata("application/pdf", "INLINE")));
+        //mime parameters are dropped before the comparison
+        assertTrue(selector.select(getMetadata("text/plain;charset=UTF-7", "INLINE")));
+
+        //resource type not in the include list
+        assertFalse(selector.select(getMetadata("application/pdf", "MACRO")));
+        //mime not in the include list
+        assertFalse(selector.select(getMetadata("application/docx", "")));
+    }
+
+    //loads the test config and returns the selector of an extractor built by its factory
+    private EmbeddedBytesSelector loadSelector() throws Exception {
+        TikaConfig tikaConfig;
+        try (InputStream stream = TikaConfig.class.getResourceAsStream(
+                "TIKA-4207-embedded-bytes-config.xml")) {
+            tikaConfig = new TikaConfig(stream);
+        }
+        ParsingEmbeddedDocumentExtractorFactory factory =
+                (ParsingEmbeddedDocumentExtractorFactory) tikaConfig
+                        .getAutoDetectParserConfig().getEmbeddedDocumentExtractorFactory();
+        ParsingEmbeddedDocumentExtractor extractor = (ParsingEmbeddedDocumentExtractor)
+                factory.newInstance(new Metadata(), new ParseContext());
+        return extractor.getEmbeddedBytesSelector();
+    }
+
+    //builds metadata carrying only the non-blank values
+    private Metadata getMetadata(String mime, String embeddedResourceType) {
+        Metadata metadata = new Metadata();
+        if (!StringUtils.isBlank(mime)) {
+            metadata.set(Metadata.CONTENT_TYPE, mime);
+        }
+        if (!StringUtils.isBlank(embeddedResourceType)) {
+            metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, embeddedResourceType);
+        }
+        return metadata;
+    }
+}
diff --git 
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
new file mode 100644
index 000000000..d60c6b1ca
--- /dev/null
+++ 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+  </parsers>
+  <autoDetectParserConfig>
+    <spoolToDisk>123450</spoolToDisk>
+    <outputThreshold>678900</outputThreshold>
+    <embeddedDocumentExtractorFactory 
class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+      <writeFileNameToContent>false</writeFileNameToContent>
+      <embeddedBytesIncludeMimeTypes>
+        <mime>application/pdf</mime>
+        <mime>application/rtf</mime>
+        <mime>text/plain</mime>
+      </embeddedBytesIncludeMimeTypes>
+      <embeddedBytesIncludeEmbeddedResourceTypes>
+        <type>ATTACHMENT</type>
+        <type>INLINE</type>
+      </embeddedBytesIncludeEmbeddedResourceTypes>
+    </embeddedDocumentExtractorFactory>
+  </autoDetectParserConfig>
+</properties>
\ No newline at end of file

Reply via email to