This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new c9d1ec8be TIKA-4581 - rm metadata filter where it isn't needed any more (#2468)
c9d1ec8be is described below
commit c9d1ec8be540b7aea99d6ce52de64f68aae3a979
Author: Tim Allison <[email protected]>
AuthorDate: Tue Dec 30 16:49:58 2025 -0500
TIKA-4581 - rm metadata filter where it isn't needed any more (#2468)
* TIKA-4581 - rm metadata filter where it isn't needed any more
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 3 +-
...Filter.java => RemoveByMimeMetadataFilter.java} | 35 ++++++------
.../tika/sax/RecursiveParserWrapperHandler.java | 22 +-------
.../tika/parser/RecursiveParserWrapperTest.java | 46 ----------------
.../apache/tika/parser/image/JpegParserTest.java | 4 +-
.../test/resources/configs/TIKA-3137-include.json | 2 +-
.../tika/metadata/filter/TestMetadataFilter.java | 63 +++++++++++++---------
.../test/resources/configs/TIKA-3137-mimes-uc.json | 2 +-
.../core/resource/RecursiveMetadataResource.java | 8 +--
.../test/resources/configs/TIKA-3137-include.json | 2 +-
10 files changed, 66 insertions(+), 121 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 411bd070a..a0f0ddb57 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -553,8 +553,7 @@ public class TikaCLI {
private void handleRecursiveJson(URL url, OutputStream output) throws
IOException, SAXException, TikaException {
Metadata metadata = new Metadata();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
- RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1,
- tikaLoader.loadMetadataFilters());
+ RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1);
try (TikaInputStream tis = TikaInputStream.get(url, metadata)) {
wrapper.parse(tis, handler, metadata, context);
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/RemoveByMimeMetadataFilter.java
similarity index 74%
rename from
tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
rename to
tika-core/src/main/java/org/apache/tika/metadata/filter/RemoveByMimeMetadataFilter.java
index 5ac8651e8..a7a4c79bd 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/RemoveByMimeMetadataFilter.java
@@ -24,16 +24,17 @@ import java.util.Set;
import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
- * This class clears the entire metadata object if the
+ * This class removes the entire metadata object if the
* mime matches the mime filter. The idea is that you might not want
* to store/transmit metadata for images or specific file types.
*/
@TikaComponent
-public class ClearByMimeMetadataFilter extends MetadataFilterBase {
+public class RemoveByMimeMetadataFilter extends MetadataFilter {
/**
* Configuration class for JSON deserialization.
@@ -44,11 +45,11 @@ public class ClearByMimeMetadataFilter extends
MetadataFilterBase {
private final Set<String> mimes;
- public ClearByMimeMetadataFilter() {
+ public RemoveByMimeMetadataFilter() {
this(new HashSet<>());
}
- public ClearByMimeMetadataFilter(Set<String> mimes) {
+ public RemoveByMimeMetadataFilter(Set<String> mimes) {
this.mimes = mimes;
}
@@ -57,7 +58,7 @@ public class ClearByMimeMetadataFilter extends
MetadataFilterBase {
*
* @param config the configuration
*/
- public ClearByMimeMetadataFilter(Config config) {
+ public RemoveByMimeMetadataFilter(Config config) {
this.mimes = new HashSet<>(config.mimes);
}
@@ -67,25 +68,29 @@ public class ClearByMimeMetadataFilter extends
MetadataFilterBase {
*
* @param jsonConfig JSON configuration
*/
- public ClearByMimeMetadataFilter(JsonConfig jsonConfig) {
+ public RemoveByMimeMetadataFilter(JsonConfig jsonConfig) {
this(ConfigDeserializer.buildConfig(jsonConfig, Config.class));
}
@Override
- protected void filter(Metadata metadata) {
+ public List<Metadata> filter(List<Metadata> metadataList) throws
TikaException {
+ List<Metadata> result = new ArrayList<>(metadataList);
+ result.removeIf(this::shouldRemove);
+ return result;
+ }
+
+ private boolean shouldRemove(Metadata metadata) {
String mimeString = metadata.get(Metadata.CONTENT_TYPE);
if (mimeString == null) {
- return;
+ return false;
}
MediaType mt = MediaType.parse(mimeString);
- if (mt != null) {
- mimeString = mt.getBaseType().toString();
- }
- if (mimes.contains(mimeString)) {
- for (String n : metadata.names()) {
- metadata.remove(n);
- }
+ if (mt == null) {
+ return false;
}
+ mimeString = mt.getBaseType().toString();
+
+ return mimes.contains(mimeString);
}
/**
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index b65fdbd61..154d5733a 100644
---
a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++
b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -26,11 +26,8 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.filter.MetadataFilter;
-import org.apache.tika.metadata.filter.NoOpFilter;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.utils.ParserUtils;
@@ -51,13 +48,12 @@ import org.apache.tika.utils.ParserUtils;
public class RecursiveParserWrapperHandler extends
AbstractRecursiveParserWrapperHandler {
protected final List<Metadata> metadataList = new LinkedList<>();
- private final MetadataFilter metadataFilter;
/**
* Create a handler with no limit on the number of embedded resources
*/
public RecursiveParserWrapperHandler(ContentHandlerFactory
contentHandlerFactory) {
- this(contentHandlerFactory, -1, NoOpFilter.NOOP_FILTER);
+ super(contentHandlerFactory, -1);
}
/**
@@ -68,13 +64,7 @@ public class RecursiveParserWrapperHandler extends
AbstractRecursiveParserWrappe
*/
public RecursiveParserWrapperHandler(ContentHandlerFactory
contentHandlerFactory,
int maxEmbeddedResources) {
- this(contentHandlerFactory, maxEmbeddedResources,
NoOpFilter.NOOP_FILTER);
- }
-
- public RecursiveParserWrapperHandler(ContentHandlerFactory
contentHandlerFactory,
- int maxEmbeddedResources,
MetadataFilter metadataFilter) {
super(contentHandlerFactory, maxEmbeddedResources);
- this.metadataFilter = metadataFilter;
}
/**
@@ -102,11 +92,6 @@ public class RecursiveParserWrapperHandler extends
AbstractRecursiveParserWrappe
throws SAXException {
super.endEmbeddedDocument(contentHandler, metadata);
addContent(contentHandler, metadata);
- try {
- metadataFilter.filter(List.of(metadata));
- } catch (TikaException e) {
- throw new SAXException(e);
- }
if (metadata.size() > 0) {
metadataList.add(ParserUtils.cloneMetadata(metadata));
@@ -122,11 +107,6 @@ public class RecursiveParserWrapperHandler extends
AbstractRecursiveParserWrappe
public void endDocument(ContentHandler contentHandler, Metadata metadata)
throws SAXException {
super.endDocument(contentHandler, metadata);
addContent(contentHandler, metadata);
- try {
- metadataFilter.filter(List.of(metadata));
- } catch (TikaException e) {
- throw new SAXException(e);
- }
if (metadata.size() > 0) {
metadataList.add(0, ParserUtils.cloneMetadata(metadata));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 3cabccdfa..e009d65b9 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -20,7 +20,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assertions.fail;
import java.io.IOException;
import java.io.InputStream;
@@ -33,16 +32,13 @@ import org.apache.commons.io.input.ClosedInputStream;
import org.apache.commons.io.input.ProxyInputStream;
import org.junit.jupiter.api.Test;
-import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
-import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.digest.DigestDef;
import org.apache.tika.digest.Digester;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.parser.digestutils.CommonsDigester;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -393,48 +389,6 @@ public class RecursiveParserWrapperTest extends TikaTest {
}
- @Test
- public void testIncludeFilter() throws Exception {
- //TIKA-3137
- ParseContext context = new ParseContext();
- Metadata metadata = new Metadata();
- TikaLoader tikaLoader =
TikaLoaderHelper.getLoader("TIKA-3137-include.json");
- Parser p = tikaLoader.loadAutoDetectParser();
- MetadataFilter metadataFilter = tikaLoader.loadMetadataFilters();
- RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, true);
- String path = "/test-documents/test_recursive_embedded.docx";
- ContentHandlerFactory contentHandlerFactory =
- new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
-
- RecursiveParserWrapperHandler handler =
- new RecursiveParserWrapperHandler(contentHandlerFactory, -1,
- metadataFilter);
- try (TikaInputStream tis = getResourceAsStream(path)) {
- wrapper.parse(tis, handler, metadata, context);
- }
- List<Metadata> metadataList = handler.getMetadataList();
- assertEquals(5, metadataList.size());
-
- Set<String> expectedKeys = new HashSet<>();
- expectedKeys.add("X-TIKA:content");
- expectedKeys.add("extended-properties:Application");
- expectedKeys.add("Content-Type");
- for (Metadata m : metadataList) {
- if (m.get(Metadata.CONTENT_TYPE).equals("image/emf")) {
- fail("emf should have been filtered out");
- }
- if (m.get(Metadata.CONTENT_TYPE).startsWith("text/plain")) {
- fail("text/plain should have been filtered out");
- }
- assertTrue(m.names().length >= 2);
- for (String n : m.names()) {
- if (!expectedKeys.contains(n)) {
- fail("didn't expect " + n);
- }
- }
- }
- }
-
@SuppressWarnings("deprecation")
private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory
contentHandlerFactory,
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java
index c811b9356..11af0ccef 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java
@@ -45,12 +45,12 @@ public class JpegParserTest extends TikaTest {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
RecursiveParserWrapperHandler handler =
- new RecursiveParserWrapperHandler(new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1),
1000, metadataFilter);
+ new RecursiveParserWrapperHandler(new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1),
1000);
try (InputStream is =
getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg")) {
wrapper.parse(TikaInputStream.get(is), handler, new Metadata(),
new ParseContext());
}
List<Metadata> metadataList = handler.getMetadataList();
-
+ metadataList = metadataFilter.filter(metadataList);
Metadata metadata = metadataList.get(0);
// Geo tags should be there with 5dp, and not rounded
assertEquals("51.575762", metadata.get(Metadata.LATITUDE));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-3137-include.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-3137-include.json
index b18cfb799..b9acf2658 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-3137-include.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-3137-include.json
@@ -10,7 +10,7 @@
}
},
{
- "clear-by-mime-metadata-filter": {
+ "remove-by-mime-metadata-filter": {
"mimes": [
"image/emf",
"text/plain"
diff --git
a/tika-serialization/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
b/tika-serialization/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
index 0c24b6860..ecbdef42c 100644
---
a/tika-serialization/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
+++
b/tika-serialization/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -127,40 +127,51 @@ public class TestMetadataFilter extends TikaTest {
}
@Test
- public void testMimeClearingFilter() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE,
MediaType.image("jpeg").toString());
- metadata.set("author", "author");
-
- MetadataFilter filter = new
ClearByMimeMetadataFilter(set("image/jpeg", "application/pdf"));
- metadata = filterOne(filter, metadata);
- assertEquals(0, metadata.size());
-
- metadata.set(Metadata.CONTENT_TYPE,
MediaType.text("plain").toString());
- metadata.set("author", "author");
- metadata = filterOne(filter, metadata);
- assertEquals(2, metadata.size());
- assertEquals("author", metadata.get("author"));
-
+ public void testMimeRemovingFilter() throws Exception {
+ Metadata jpegMetadata = new Metadata();
+ jpegMetadata.set(Metadata.CONTENT_TYPE,
MediaType.image("jpeg").toString());
+ jpegMetadata.set("author", "author");
+
+ Metadata plainMetadata = new Metadata();
+ plainMetadata.set(Metadata.CONTENT_TYPE,
MediaType.text("plain").toString());
+ plainMetadata.set("author", "author");
+
+ MetadataFilter filter = new
RemoveByMimeMetadataFilter(set("image/jpeg", "application/pdf"));
+
+ // jpeg should be removed
+ List<Metadata> result = filter.filter(List.of(jpegMetadata));
+ assertEquals(0, result.size());
+
+ // text/plain should be kept
+ result = filter.filter(List.of(plainMetadata));
+ assertEquals(1, result.size());
+ assertEquals(2, result.get(0).size());
+ assertEquals("author", result.get(0).get("author"));
}
@Test
- public void testMimeClearingFilterConfig() throws Exception {
+ public void testMimeRemovingFilterConfig() throws Exception {
TikaLoader loader = TikaLoader.load(getConfigPath(getClass(),
"TIKA-3137-mimes-uc.json"));
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE,
MediaType.image("jpeg").toString());
- metadata.set("author", "author");
+ Metadata jpegMetadata = new Metadata();
+ jpegMetadata.set(Metadata.CONTENT_TYPE,
MediaType.image("jpeg").toString());
+ jpegMetadata.set("author", "author");
+
+ Metadata plainMetadata = new Metadata();
+ plainMetadata.set(Metadata.CONTENT_TYPE,
MediaType.text("plain").toString());
+ plainMetadata.set("author", "author");
MetadataFilter filter = loader.get(MetadataFilter.class);
- metadata = filterOne(filter, metadata);
- assertEquals(0, metadata.size());
- metadata.set(Metadata.CONTENT_TYPE,
MediaType.text("plain").toString());
- metadata.set("author", "author");
- metadata = filterOne(filter, metadata);
- assertEquals(2, metadata.size());
- assertEquals("AUTHOR", metadata.get("author"));
+ // jpeg should be removed
+ List<Metadata> result = filter.filter(List.of(jpegMetadata));
+ assertEquals(0, result.size());
+
+ // text/plain should be kept and upper-cased by mock-upper-case-filter
+ result = filter.filter(List.of(plainMetadata));
+ assertEquals(1, result.size());
+ assertEquals(2, result.get(0).size());
+ assertEquals("AUTHOR", result.get(0).get("author"));
}
@Test
diff --git
a/tika-serialization/src/test/resources/configs/TIKA-3137-mimes-uc.json
b/tika-serialization/src/test/resources/configs/TIKA-3137-mimes-uc.json
index 036f23a53..62f40ac87 100644
--- a/tika-serialization/src/test/resources/configs/TIKA-3137-mimes-uc.json
+++ b/tika-serialization/src/test/resources/configs/TIKA-3137-mimes-uc.json
@@ -1,7 +1,7 @@
{
"metadata-filters": [
{
- "clear-by-mime-metadata-filter": {
+ "remove-by-mime-metadata-filter": {
"mimes": ["image/jpeg", "application/pdf"]
}
},
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index 3f71ae67e..61d6f1440 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -72,9 +72,7 @@ public class RecursiveMetadataResource {
BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType();
RecursiveParserWrapperHandler handler =
new RecursiveParserWrapperHandler(new
BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(),
handlerConfig.isThrowOnWriteLimitReached(), context),
- handlerConfig.getMaxEmbeddedResources(), TikaResource
- .getTikaLoader()
- .loadMetadataFilters());
+ handlerConfig.getMaxEmbeddedResources());
try {
TikaResource.parse(wrapper, LOG, "/rmeta", tis, handler, metadata,
context);
} catch (TikaServerParseException e) {
@@ -179,9 +177,7 @@ public class RecursiveMetadataResource {
BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType();
RecursiveParserWrapperHandler handler =
new RecursiveParserWrapperHandler(new
BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(),
handlerConfig.isThrowOnWriteLimitReached(), context),
- handlerConfig.getMaxEmbeddedResources(), TikaResource
- .getTikaLoader()
- .loadMetadataFilters());
+ handlerConfig.getMaxEmbeddedResources());
try {
TikaResource.parse(wrapper, LOG, "/rmeta/config", tis, handler,
metadata, context);
} catch (TikaServerParseException e) {
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/TIKA-3137-include.json
b/tika-server/tika-server-standard/src/test/resources/configs/TIKA-3137-include.json
index 5b68d569c..0355de40d 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/TIKA-3137-include.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/TIKA-3137-include.json
@@ -10,7 +10,7 @@
}
},
{
- "clear-by-mime-metadata-filter": {
+ "remove-by-mime-metadata-filter": {
"mimes": [
"image/emf",
"text/plain"