This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new c9d1ec8be TIKA-4581 - rm metadata filter where it isn't needed any more (#2468)
c9d1ec8be is described below

commit c9d1ec8be540b7aea99d6ce52de64f68aae3a979
Author: Tim Allison <[email protected]>
AuthorDate: Tue Dec 30 16:49:58 2025 -0500

    TIKA-4581 - rm metadata filter where it isn't needed any more (#2468)
    
    * TIKA-4581 - rm metadata filter where it isn't needed any more
---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  3 +-
 ...Filter.java => RemoveByMimeMetadataFilter.java} | 35 ++++++------
 .../tika/sax/RecursiveParserWrapperHandler.java    | 22 +-------
 .../tika/parser/RecursiveParserWrapperTest.java    | 46 ----------------
 .../apache/tika/parser/image/JpegParserTest.java   |  4 +-
 .../test/resources/configs/TIKA-3137-include.json  |  2 +-
 .../tika/metadata/filter/TestMetadataFilter.java   | 63 +++++++++++++---------
 .../test/resources/configs/TIKA-3137-mimes-uc.json |  2 +-
 .../core/resource/RecursiveMetadataResource.java   |  8 +--
 .../test/resources/configs/TIKA-3137-include.json  |  2 +-
 10 files changed, 66 insertions(+), 121 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 411bd070a..a0f0ddb57 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -553,8 +553,7 @@ public class TikaCLI {
     private void handleRecursiveJson(URL url, OutputStream output) throws 
IOException, SAXException, TikaException {
         Metadata metadata = new Metadata();
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
-        RecursiveParserWrapperHandler handler = new 
RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1,
-                tikaLoader.loadMetadataFilters());
+        RecursiveParserWrapperHandler handler = new 
RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1);
         try (TikaInputStream tis = TikaInputStream.get(url, metadata)) {
             wrapper.parse(tis, handler, metadata, context);
         }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/RemoveByMimeMetadataFilter.java
similarity index 74%
rename from tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
rename to tika-core/src/main/java/org/apache/tika/metadata/filter/RemoveByMimeMetadataFilter.java
index 5ac8651e8..a7a4c79bd 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/RemoveByMimeMetadataFilter.java
@@ -24,16 +24,17 @@ import java.util.Set;
 import org.apache.tika.config.ConfigDeserializer;
 import org.apache.tika.config.JsonConfig;
 import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
 /**
- * This class clears the entire metadata object if the
+ * This class removes the entire metadata object if the
  * mime matches the mime filter.  The idea is that you might not want
  * to store/transmit metadata for images or specific file types.
  */
 @TikaComponent
-public class ClearByMimeMetadataFilter extends MetadataFilterBase {
+public class RemoveByMimeMetadataFilter extends MetadataFilter {
 
     /**
      * Configuration class for JSON deserialization.
@@ -44,11 +45,11 @@ public class ClearByMimeMetadataFilter extends MetadataFilterBase {
 
     private final Set<String> mimes;
 
-    public ClearByMimeMetadataFilter() {
+    public RemoveByMimeMetadataFilter() {
         this(new HashSet<>());
     }
 
-    public ClearByMimeMetadataFilter(Set<String> mimes) {
+    public RemoveByMimeMetadataFilter(Set<String> mimes) {
         this.mimes = mimes;
     }
 
@@ -57,7 +58,7 @@ public class ClearByMimeMetadataFilter extends MetadataFilterBase {
      *
      * @param config the configuration
      */
-    public ClearByMimeMetadataFilter(Config config) {
+    public RemoveByMimeMetadataFilter(Config config) {
         this.mimes = new HashSet<>(config.mimes);
     }
 
@@ -67,25 +68,29 @@ public class ClearByMimeMetadataFilter extends MetadataFilterBase {
      *
      * @param jsonConfig JSON configuration
      */
-    public ClearByMimeMetadataFilter(JsonConfig jsonConfig) {
+    public RemoveByMimeMetadataFilter(JsonConfig jsonConfig) {
         this(ConfigDeserializer.buildConfig(jsonConfig, Config.class));
     }
 
     @Override
-    protected void filter(Metadata metadata) {
+    public List<Metadata> filter(List<Metadata> metadataList) throws 
TikaException {
+        List<Metadata> result = new ArrayList<>(metadataList);
+        result.removeIf(this::shouldRemove);
+        return result;
+    }
+
+    private boolean shouldRemove(Metadata metadata) {
         String mimeString = metadata.get(Metadata.CONTENT_TYPE);
         if (mimeString == null) {
-            return;
+            return false;
         }
         MediaType mt = MediaType.parse(mimeString);
-        if (mt != null) {
-            mimeString = mt.getBaseType().toString();
-        }
-        if (mimes.contains(mimeString)) {
-            for (String n : metadata.names()) {
-                metadata.remove(n);
-            }
+        if (mt == null) {
+            return false;
         }
+        mimeString = mt.getBaseType().toString();
+
+        return mimes.contains(mimeString);
     }
 
     /**
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index b65fdbd61..154d5733a 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -26,11 +26,8 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.filter.MetadataFilter;
-import org.apache.tika.metadata.filter.NoOpFilter;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.utils.ParserUtils;
 
@@ -51,13 +48,12 @@ import org.apache.tika.utils.ParserUtils;
 public class RecursiveParserWrapperHandler extends 
AbstractRecursiveParserWrapperHandler {
 
     protected final List<Metadata> metadataList = new LinkedList<>();
-    private final MetadataFilter metadataFilter;
 
     /**
      * Create a handler with no limit on the number of embedded resources
      */
     public RecursiveParserWrapperHandler(ContentHandlerFactory 
contentHandlerFactory) {
-        this(contentHandlerFactory, -1, NoOpFilter.NOOP_FILTER);
+        super(contentHandlerFactory, -1);
     }
 
     /**
@@ -68,13 +64,7 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
      */
     public RecursiveParserWrapperHandler(ContentHandlerFactory 
contentHandlerFactory,
                                          int maxEmbeddedResources) {
-        this(contentHandlerFactory, maxEmbeddedResources, 
NoOpFilter.NOOP_FILTER);
-    }
-
-    public RecursiveParserWrapperHandler(ContentHandlerFactory 
contentHandlerFactory,
-                                         int maxEmbeddedResources, 
MetadataFilter metadataFilter) {
         super(contentHandlerFactory, maxEmbeddedResources);
-        this.metadataFilter = metadataFilter;
     }
 
     /**
@@ -102,11 +92,6 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
             throws SAXException {
         super.endEmbeddedDocument(contentHandler, metadata);
         addContent(contentHandler, metadata);
-        try {
-            metadataFilter.filter(List.of(metadata));
-        } catch (TikaException e) {
-            throw new SAXException(e);
-        }
 
         if (metadata.size() > 0) {
             metadataList.add(ParserUtils.cloneMetadata(metadata));
@@ -122,11 +107,6 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
     public void endDocument(ContentHandler contentHandler, Metadata metadata) 
throws SAXException {
         super.endDocument(contentHandler, metadata);
         addContent(contentHandler, metadata);
-        try {
-            metadataFilter.filter(List.of(metadata));
-        } catch (TikaException e) {
-            throw new SAXException(e);
-        }
         if (metadata.size() > 0) {
             metadataList.add(0, ParserUtils.cloneMetadata(metadata));
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 3cabccdfa..e009d65b9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -20,7 +20,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assertions.fail;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -33,16 +32,13 @@ import org.apache.commons.io.input.ClosedInputStream;
 import org.apache.commons.io.input.ProxyInputStream;
 import org.junit.jupiter.api.Test;
 
-import org.apache.tika.TikaLoaderHelper;
 import org.apache.tika.TikaTest;
-import org.apache.tika.config.loader.TikaLoader;
 import org.apache.tika.digest.DigestDef;
 import org.apache.tika.digest.Digester;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.filter.MetadataFilter;
 import org.apache.tika.parser.digestutils.CommonsDigester;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -393,48 +389,6 @@ public class RecursiveParserWrapperTest extends TikaTest {
 
     }
 
-    @Test
-    public void testIncludeFilter() throws Exception {
-        //TIKA-3137
-        ParseContext context = new ParseContext();
-        Metadata metadata = new Metadata();
-        TikaLoader tikaLoader = 
TikaLoaderHelper.getLoader("TIKA-3137-include.json");
-        Parser p = tikaLoader.loadAutoDetectParser();
-        MetadataFilter metadataFilter = tikaLoader.loadMetadataFilters();
-        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, true);
-        String path = "/test-documents/test_recursive_embedded.docx";
-        ContentHandlerFactory contentHandlerFactory =
-                new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
-
-        RecursiveParserWrapperHandler handler =
-                new RecursiveParserWrapperHandler(contentHandlerFactory, -1,
-                        metadataFilter);
-        try (TikaInputStream tis = getResourceAsStream(path)) {
-            wrapper.parse(tis, handler, metadata, context);
-        }
-        List<Metadata> metadataList = handler.getMetadataList();
-        assertEquals(5, metadataList.size());
-
-        Set<String> expectedKeys = new HashSet<>();
-        expectedKeys.add("X-TIKA:content");
-        expectedKeys.add("extended-properties:Application");
-        expectedKeys.add("Content-Type");
-        for (Metadata m : metadataList) {
-            if (m.get(Metadata.CONTENT_TYPE).equals("image/emf")) {
-                fail("emf should have been filtered out");
-            }
-            if (m.get(Metadata.CONTENT_TYPE).startsWith("text/plain")) {
-                fail("text/plain should have been filtered out");
-            }
-            assertTrue(m.names().length >= 2);
-            for (String n : m.names()) {
-                if (!expectedKeys.contains(n)) {
-                    fail("didn't expect " + n);
-                }
-            }
-        }
-    }
-
     @SuppressWarnings("deprecation")
     private List<Metadata> getMetadata(Metadata metadata,
                                        ContentHandlerFactory 
contentHandlerFactory,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java
index c811b9356..11af0ccef 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java
@@ -45,12 +45,12 @@ public class JpegParserTest extends TikaTest {
 
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
         RecursiveParserWrapperHandler handler =
-                new RecursiveParserWrapperHandler(new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1), 
1000, metadataFilter);
+                new RecursiveParserWrapperHandler(new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1), 
1000);
         try (InputStream is = 
getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg")) {
             wrapper.parse(TikaInputStream.get(is), handler, new Metadata(), 
new ParseContext());
         }
         List<Metadata> metadataList = handler.getMetadataList();
-
+        metadataList = metadataFilter.filter(metadataList);
         Metadata metadata = metadataList.get(0);
         // Geo tags should be there with 5dp, and not rounded
         assertEquals("51.575762", metadata.get(Metadata.LATITUDE));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-3137-include.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-3137-include.json
index b18cfb799..b9acf2658 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-3137-include.json
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-3137-include.json
@@ -10,7 +10,7 @@
       }
     },
     {
-      "clear-by-mime-metadata-filter": {
+      "remove-by-mime-metadata-filter": {
         "mimes": [
           "image/emf",
           "text/plain"
diff --git a/tika-serialization/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-serialization/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
index 0c24b6860..ecbdef42c 100644
--- a/tika-serialization/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
+++ b/tika-serialization/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -127,40 +127,51 @@ public class TestMetadataFilter extends TikaTest {
     }
 
     @Test
-    public void testMimeClearingFilter() throws Exception {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, 
MediaType.image("jpeg").toString());
-        metadata.set("author", "author");
-
-        MetadataFilter filter = new 
ClearByMimeMetadataFilter(set("image/jpeg", "application/pdf"));
-        metadata = filterOne(filter, metadata);
-        assertEquals(0, metadata.size());
-
-        metadata.set(Metadata.CONTENT_TYPE, 
MediaType.text("plain").toString());
-        metadata.set("author", "author");
-        metadata = filterOne(filter, metadata);
-        assertEquals(2, metadata.size());
-        assertEquals("author", metadata.get("author"));
-
+    public void testMimeRemovingFilter() throws Exception {
+        Metadata jpegMetadata = new Metadata();
+        jpegMetadata.set(Metadata.CONTENT_TYPE, 
MediaType.image("jpeg").toString());
+        jpegMetadata.set("author", "author");
+
+        Metadata plainMetadata = new Metadata();
+        plainMetadata.set(Metadata.CONTENT_TYPE, 
MediaType.text("plain").toString());
+        plainMetadata.set("author", "author");
+
+        MetadataFilter filter = new 
RemoveByMimeMetadataFilter(set("image/jpeg", "application/pdf"));
+
+        // jpeg should be removed
+        List<Metadata> result = filter.filter(List.of(jpegMetadata));
+        assertEquals(0, result.size());
+
+        // text/plain should be kept
+        result = filter.filter(List.of(plainMetadata));
+        assertEquals(1, result.size());
+        assertEquals(2, result.get(0).size());
+        assertEquals("author", result.get(0).get("author"));
     }
 
     @Test
-    public void testMimeClearingFilterConfig() throws Exception {
+    public void testMimeRemovingFilterConfig() throws Exception {
         TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), 
"TIKA-3137-mimes-uc.json"));
 
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.CONTENT_TYPE, 
MediaType.image("jpeg").toString());
-        metadata.set("author", "author");
+        Metadata jpegMetadata = new Metadata();
+        jpegMetadata.set(Metadata.CONTENT_TYPE, 
MediaType.image("jpeg").toString());
+        jpegMetadata.set("author", "author");
+
+        Metadata plainMetadata = new Metadata();
+        plainMetadata.set(Metadata.CONTENT_TYPE, 
MediaType.text("plain").toString());
+        plainMetadata.set("author", "author");
 
         MetadataFilter filter = loader.get(MetadataFilter.class);
-        metadata = filterOne(filter, metadata);
-        assertEquals(0, metadata.size());
 
-        metadata.set(Metadata.CONTENT_TYPE, 
MediaType.text("plain").toString());
-        metadata.set("author", "author");
-        metadata = filterOne(filter, metadata);
-        assertEquals(2, metadata.size());
-        assertEquals("AUTHOR", metadata.get("author"));
+        // jpeg should be removed
+        List<Metadata> result = filter.filter(List.of(jpegMetadata));
+        assertEquals(0, result.size());
+
+        // text/plain should be kept and upper-cased by mock-upper-case-filter
+        result = filter.filter(List.of(plainMetadata));
+        assertEquals(1, result.size());
+        assertEquals(2, result.get(0).size());
+        assertEquals("AUTHOR", result.get(0).get("author"));
     }
 
     @Test
diff --git a/tika-serialization/src/test/resources/configs/TIKA-3137-mimes-uc.json b/tika-serialization/src/test/resources/configs/TIKA-3137-mimes-uc.json
index 036f23a53..62f40ac87 100644
--- a/tika-serialization/src/test/resources/configs/TIKA-3137-mimes-uc.json
+++ b/tika-serialization/src/test/resources/configs/TIKA-3137-mimes-uc.json
@@ -1,7 +1,7 @@
 {
   "metadata-filters": [
     {
-      "clear-by-mime-metadata-filter": {
+      "remove-by-mime-metadata-filter": {
         "mimes": ["image/jpeg", "application/pdf"]
       }
     },
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index 3f71ae67e..61d6f1440 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -72,9 +72,7 @@ public class RecursiveMetadataResource {
         BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType();
         RecursiveParserWrapperHandler handler =
                 new RecursiveParserWrapperHandler(new 
BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), 
handlerConfig.isThrowOnWriteLimitReached(), context),
-                        handlerConfig.getMaxEmbeddedResources(), TikaResource
-                        .getTikaLoader()
-                        .loadMetadataFilters());
+                        handlerConfig.getMaxEmbeddedResources());
         try {
             TikaResource.parse(wrapper, LOG, "/rmeta", tis, handler, metadata, 
context);
         } catch (TikaServerParseException e) {
@@ -179,9 +177,7 @@ public class RecursiveMetadataResource {
         BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType();
         RecursiveParserWrapperHandler handler =
                 new RecursiveParserWrapperHandler(new 
BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), 
handlerConfig.isThrowOnWriteLimitReached(), context),
-                        handlerConfig.getMaxEmbeddedResources(), TikaResource
-                        .getTikaLoader()
-                        .loadMetadataFilters());
+                        handlerConfig.getMaxEmbeddedResources());
         try {
             TikaResource.parse(wrapper, LOG, "/rmeta/config", tis, handler, 
metadata, context);
         } catch (TikaServerParseException e) {
diff --git a/tika-server/tika-server-standard/src/test/resources/configs/TIKA-3137-include.json b/tika-server/tika-server-standard/src/test/resources/configs/TIKA-3137-include.json
index 5b68d569c..0355de40d 100644
--- a/tika-server/tika-server-standard/src/test/resources/configs/TIKA-3137-include.json
+++ b/tika-server/tika-server-standard/src/test/resources/configs/TIKA-3137-include.json
@@ -10,7 +10,7 @@
       }
     },
     {
-      "clear-by-mime-metadata-filter": {
+      "remove-by-mime-metadata-filter": {
         "mimes": [
           "image/emf",
           "text/plain"

Reply via email to