This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 72035660a TIKA-4546 -- simplify metadata filtering (#2405)
72035660a is described below
commit 72035660a759858a60b8bbc034c9f7b10f645faa
Author: Tim Allison <[email protected]>
AuthorDate: Tue Nov 25 14:46:57 2025 -0500
TIKA-4546 -- simplify metadata filtering (#2405)
---
CHANGES.txt | 18 ++++++-
.../tika/annotation/TikaComponentProcessor.java | 2 +-
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +-
.../src/main/java/org/apache/tika/gui/TikaGUI.java | 2 +-
.../java/org/apache/tika/config/TikaConfig.java | 11 ----
.../filter/CaptureGroupMetadataFilter.java | 5 +-
.../ClearByAttachmentTypeMetadataFilter.java | 9 ++--
.../metadata/filter/ClearByMimeMetadataFilter.java | 5 +-
.../metadata/filter/CompositeMetadataFilter.java | 7 +--
.../filter/DateNormalizingMetadataFilter.java | 6 +--
.../filter/ExcludeFieldMetadataFilter.java | 5 +-
.../metadata/filter/FieldNameMappingFilter.java | 5 +-
.../metadata/filter/GeoPointMetadataFilter.java | 5 +-
.../filter/IncludeFieldMetadataFilter.java | 5 +-
.../tika/metadata/filter/MetadataFilter.java | 21 ++++----
.../MetadataFilterBase.java} | 18 +++++--
.../apache/tika/metadata/filter/NoOpFilter.java | 7 ++-
.../listfilter/CompositeMetadataListFilter.java | 58 ----------------------
.../metadata/listfilter/MetadataListFilter.java | 51 -------------------
.../tika/sax/RecursiveParserWrapperHandler.java | 4 +-
.../AttachmentCountingListFilter.java | 4 +-
.../MetadataListFilterTest.java | 6 +--
.../tika/metadata/filter/MockUpperCaseFilter.java | 5 +-
.../tika/metadata/filter/TestMetadataFilter.java | 40 ++++++++-------
.../metadatafilter-config.xml} | 6 +--
.../eval/core/metadata/TikaEvalMetadataFilter.java | 7 ++-
.../core/metadata/TikaEvalMetadataFilterTest.java | 4 +-
.../metadatafilter/OpenNLPMetadataFilter.java | 7 ++-
.../metadatafilter/OptimaizeMetadataFilter.java | 7 ++-
.../org/apache/tika/pipes/core/PassbackFilter.java | 4 +-
.../org/apache/tika/pipes/core/PipesServer.java | 26 ++--------
.../apache/tika/pipes/core/PipesClientTest.java | 12 ++---
.../pipes/emitter/solr/SolrEmitterDevTest.java | 3 +-
.../org/apache/tika/config/loader/TikaLoader.java | 8 +--
.../core/resource/RecursiveMetadataResource.java | 8 +--
.../tika/server/core/resource/TikaResource.java | 16 ++++--
36 files changed, 154 insertions(+), 255 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 2d91e48c2..bea9b5235 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,16 +1,30 @@
Release 4.0.0-BETA1 - ???
+
BREAKING CHANGES
+ * Moved towards JSON-based configuration by default (TIKA-4544 and many
others).
+
+ * tika-pipes implementation modules have been reorganized by
+ resource (tika-pipes-solr) vs task (tika-pipes-fetcher-solr)
+ (TIKA-4543). Note that the file-system pipes components have
+ been taken out of tika-pipes-core and placed in their own
+ pf4j module: tika-pipes-file-system.
+
+ * tika-pipes implementation modules are now pf4j plugins (TIKA-4519).
+
* tika-pipes core classes have been moved to a new module: tika-pipes-core,
and the FileSystem pipes components have moved (TIKA-4334).
- * Headers are no longer injected into the body/content of MSG files
(TIKA-4345). Please open
- a ticket if you need this behavior across email formats.
+ * MetadataListFilter has been renamed MetadataFilter, and the old per-object
MetadataFilter has been removed; single-object filters now extend MetadataFilterBase (TIKA-4546).
* Removed several modules, including: tika-batch (TIKA-4333), snaps
deployment (TIKA-4502),
dotnet (TIKA-4332), advanced media module (TIKA-4500), tika-dl module
(TIKA-4499),
tika-fuzzing module (TIKA-4506).
+ * Headers are no longer injected into the body/content of MSG files
(TIKA-4345). Please open
+ a ticket if you need this behavior across email formats.
+
* API changes in the EmbeddedStreamTranslator (TIKA-4518).
OTHER CHANGES
diff --git
a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
index 3a1800679..f37ac1e3f 100644
---
a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
+++
b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
@@ -70,7 +70,7 @@ public class TikaComponentProcessor extends AbstractProcessor
{
SERVICE_INTERFACES.put("org.apache.tika.detect.EncodingDetector",
"encoding-detectors");
SERVICE_INTERFACES.put("org.apache.tika.language.translate.Translator",
"translators");
SERVICE_INTERFACES.put("org.apache.tika.renderer.Renderer",
"renderers");
-
SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter",
"metadata-filters");
+
SERVICE_INTERFACES.put("org.apache.tika.metadata.listfilter.MetadataFilter",
"metadata-filters");
}
private Messager messager;
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 2cb3f7f5d..050c00dfe 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -525,7 +525,7 @@ public class TikaCLI {
JsonMetadataList.setPrettyPrinting(prettyPrint);
try (Writer writer = getOutputWriter(output, encoding)) {
List<Metadata> metadataList = handler.getMetadataList();
- metadataList = config.getMetadataListFilter().filter(metadataList);
+ metadataList = config.getMetadataFilter().filter(metadataList);
JsonMetadataList.toJson(metadataList, writer);
}
}
diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
index 1da9cda9c..a2a4d526e 100644
--- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
+++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
@@ -384,7 +384,7 @@ public class TikaGUI extends JFrame implements
ActionListener, HyperlinkListener
StringWriter jsonBuffer = new StringWriter();
JsonMetadataList.setPrettyPrinting(true);
List<Metadata> metadataList =
recursiveParserWrapperHandler.getMetadataList();
- metadataList =
tikaConfig.getMetadataListFilter().filter(metadataList);
+ metadataList = tikaConfig.getMetadataFilter().filter(metadataList);
JsonMetadataList.toJson(metadataList, jsonBuffer);
setText(json, jsonBuffer.toString());
}
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 14ff21b11..fc7e84d78 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -64,8 +64,6 @@ import org.apache.tika.language.translate.DefaultTranslator;
import org.apache.tika.language.translate.Translator;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.NoOpFilter;
-import org.apache.tika.metadata.listfilter.MetadataListFilter;
-import org.apache.tika.metadata.listfilter.NoOpListFilter;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypeException;
@@ -108,7 +106,6 @@ public class TikaConfig {
private final EncodingDetector encodingDetector;
private final Renderer renderer;
private final MetadataFilter metadataFilter;
- private final MetadataListFilter metadataListFilter;
private final AutoDetectParserConfig autoDetectParserConfig;
private static final Map<String, InitializableProblemHandler> strategyMap
= new HashMap<>();
@@ -191,7 +188,6 @@ public class TikaConfig {
this.translator = translatorLoader.loadOverall(element, mimeTypes,
loader);
this.executorService = executorLoader.loadOverall(element, mimeTypes,
loader);
this.metadataFilter = MetadataFilter.load(element, true);
- this.metadataListFilter = MetadataListFilter.load(element, true);
this.autoDetectParserConfig = AutoDetectParserConfig.load(element);
this.serviceLoader = loader;
setMaxJsonStringFieldLength(element);
@@ -220,7 +216,6 @@ public class TikaConfig {
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
this.metadataFilter = new NoOpFilter();
- this.metadataListFilter = new NoOpListFilter();
this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT;
TIMES_INSTANTIATED.incrementAndGet();
}
@@ -267,7 +262,6 @@ public class TikaConfig {
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
this.metadataFilter = new NoOpFilter();
- this.metadataListFilter = new NoOpListFilter();
this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT;
} else {
ServiceLoader tmpServiceLoader = new ServiceLoader();
@@ -295,7 +289,6 @@ public class TikaConfig {
this.executorService =
executorLoader.loadOverall(element, mimeTypes,
serviceLoader);
this.metadataFilter = MetadataFilter.load(element, true);
- this.metadataListFilter = MetadataListFilter.load(element,
true);
this.autoDetectParserConfig =
AutoDetectParserConfig.load(element);
setMaxJsonStringFieldLength(element);
} catch (SAXException e) {
@@ -644,10 +637,6 @@ public class TikaConfig {
return metadataFilter;
}
- public MetadataListFilter getMetadataListFilter() {
- return metadataListFilter;
- }
-
public AutoDetectParserConfig getAutoDetectParserConfig() {
return autoDetectParserConfig;
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
index 7023c4cb8..fd1cd0332 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
@@ -26,7 +26,6 @@ import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.StringUtils;
@@ -53,7 +52,7 @@ import org.apache.tika.utils.StringUtils;
* will overwrite the value in that field. Again, if there are multiple
* values in that field, those will all be overwritten.
*/
-public class CaptureGroupMetadataFilter extends MetadataFilter implements
Initializable {
+public class CaptureGroupMetadataFilter extends MetadataFilterBase implements
Initializable {
private String regexString;
private Pattern regex;
@@ -61,7 +60,7 @@ public class CaptureGroupMetadataFilter extends
MetadataFilter implements Initia
private String targetField;
@Override
- public void filter(Metadata metadata) throws TikaException {
+ protected void filter(Metadata metadata) {
String val = metadata.get(sourceField);
if (StringUtils.isBlank(val)) {
return;
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java
index 6157b4e5c..c4a912c98 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java
@@ -23,7 +23,6 @@ import java.util.Set;
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -32,7 +31,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
* attachment type matches one of the types. The idea is that you might not
want
* to store/transmit metadata for images or specific file types.
*/
-public class ClearByAttachmentTypeMetadataFilter extends MetadataFilter {
+public class ClearByAttachmentTypeMetadataFilter extends MetadataFilterBase {
private final Set<String> types;
public ClearByAttachmentTypeMetadataFilter() {
@@ -43,8 +42,8 @@ public class ClearByAttachmentTypeMetadataFilter extends
MetadataFilter {
this.types = types;
}
- @Override
- public void filter(Metadata metadata) throws TikaException {
+
+ protected void filter(Metadata metadata) {
String type = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
if (type == null) {
return;
@@ -87,4 +86,6 @@ public class ClearByAttachmentTypeMetadataFilter extends
MetadataFilter {
public List<String> getTypes() {
return new ArrayList<>(types);
}
+
+
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
index adcffc5ad..d97eeb0d8 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
@@ -22,7 +22,6 @@ import java.util.List;
import java.util.Set;
import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -31,7 +30,7 @@ import org.apache.tika.mime.MediaType;
* mime matches the mime filter. The idea is that you might not want
* to store/transmit metadata for images or specific file types.
*/
-public class ClearByMimeMetadataFilter extends MetadataFilter {
+public class ClearByMimeMetadataFilter extends MetadataFilterBase {
private final Set<String> mimes;
public ClearByMimeMetadataFilter() {
@@ -43,7 +42,7 @@ public class ClearByMimeMetadataFilter extends MetadataFilter
{
}
@Override
- public void filter(Metadata metadata) throws TikaException {
+ protected void filter(Metadata metadata) {
String mimeString = metadata.get(Metadata.CONTENT_TYPE);
if (mimeString == null) {
return;
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
index 8f5907c44..3b306cd58 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
@@ -44,14 +44,15 @@ public class CompositeMetadataFilter extends MetadataFilter
{
}
@Override
- public void filter(Metadata metadata) throws TikaException {
+ public List<Metadata> filter(List<Metadata> metadataList) throws
TikaException {
for (MetadataFilter filter : filters) {
- filter.filter(metadata);
+ metadataList = filter.filter(metadataList);
}
+ return metadataList;
}
@Override
public String toString() {
- return "CompositeMetadataFilter{" + "filters=" + filters + '}';
+ return "CompositeMetadataListFilter{" + "filters=" + filters + '}';
}
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
index ed8280bcc..868280c56 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
@@ -27,7 +27,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
@@ -43,7 +42,7 @@ import org.apache.tika.metadata.Property;
* if the file format does not specify a timezone.
*
*/
-public class DateNormalizingMetadataFilter extends MetadataFilter {
+public class DateNormalizingMetadataFilter extends MetadataFilterBase {
private static TimeZone UTC = TimeZone.getTimeZone("UTC");
@@ -51,8 +50,7 @@ public class DateNormalizingMetadataFilter extends
MetadataFilter {
private TimeZone defaultTimeZone = UTC;
- @Override
- public void filter(Metadata metadata) throws TikaException {
+ protected void filter(Metadata metadata) {
SimpleDateFormat dateFormatter = null;
SimpleDateFormat utcFormatter = null;
for (String n : metadata.names()) {
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
index e00089910..9a2683934 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
@@ -22,10 +22,9 @@ import java.util.List;
import java.util.Set;
import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-public class ExcludeFieldMetadataFilter extends MetadataFilter {
+public class ExcludeFieldMetadataFilter extends MetadataFilterBase {
private final Set<String> excludeSet;
public ExcludeFieldMetadataFilter() {
@@ -37,7 +36,7 @@ public class ExcludeFieldMetadataFilter extends
MetadataFilter {
}
@Override
- public void filter(Metadata metadata) throws TikaException {
+ protected void filter(Metadata metadata) {
for (String field : excludeSet) {
metadata.remove(field);
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java
index fe346e74d..2e5ae7b33 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java
@@ -20,17 +20,16 @@ import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-public class FieldNameMappingFilter extends MetadataFilter {
+public class FieldNameMappingFilter extends MetadataFilterBase {
Map<String, String> mappings = new LinkedHashMap<>();
boolean excludeUnmapped = true;
@Override
- public void filter(Metadata metadata) throws TikaException {
+ protected void filter(Metadata metadata) {
if (excludeUnmapped) {
for (String n : metadata.names()) {
if (mappings.containsKey(n)) {
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java
index ac48454cf..8e18da37c 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java
@@ -17,7 +17,6 @@
package org.apache.tika.metadata.filter;
import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.utils.StringUtils;
@@ -29,7 +28,7 @@ import org.apache.tika.utils.StringUtils;
*
* If you need any other mappings, please open a ticket on our JIRA.
*/
-public class GeoPointMetadataFilter extends MetadataFilter {
+public class GeoPointMetadataFilter extends MetadataFilterBase {
String geoPointFieldName = "location";
@@ -49,7 +48,7 @@ public class GeoPointMetadataFilter extends MetadataFilter {
}
@Override
- public void filter(Metadata metadata) throws TikaException {
+ protected void filter(Metadata metadata) {
String lat = metadata.get(TikaCoreProperties.LATITUDE);
if (StringUtils.isEmpty(lat)) {
return;
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
index 3fe2a90fa..e0c5285c3 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
@@ -22,10 +22,9 @@ import java.util.List;
import java.util.Set;
import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-public class IncludeFieldMetadataFilter extends MetadataFilter {
+public class IncludeFieldMetadataFilter extends MetadataFilterBase {
private final Set<String> includeSet;
public IncludeFieldMetadataFilter() {
@@ -49,7 +48,7 @@ public class IncludeFieldMetadataFilter extends
MetadataFilter {
}
@Override
- public void filter(Metadata metadata) throws TikaException {
+ protected void filter(Metadata metadata) {
for (String n : metadata.names()) {
if (!includeSet.contains(n)) {
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
index 21eb3eced..135838581 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
@@ -14,11 +14,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.tika.metadata.filter;
import java.io.IOException;
import java.io.Serializable;
+import java.util.List;
import org.w3c.dom.Element;
@@ -27,15 +27,9 @@ import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-/**
- * Filters the metadata in place after the parse
- *
- * @since Apache Tika 1.25
- */
public abstract class MetadataFilter extends ConfigBase implements
Serializable {
-
/**
- * Loads the metadata filter from the config file if it exists, otherwise
returns NoOpFilter
+ * Loads the metadata filter from the config file if it exists,
otherwise returns NoOpFilter
* @param root
* @return
* @throws TikaConfigException
@@ -54,5 +48,14 @@ public abstract class MetadataFilter extends ConfigBase
implements Serializable
}
}
- public abstract void filter(Metadata metadata) throws TikaException;
+ /**
+ * For efficiency's sake, the original metadata list and data therein may
be modified.
+ * Users are responsible for doing a defensive copy before calling filter
if mutability
+ * would be problematic.
+ *
+ * @param metadataList
+ * @return
+ * @throws TikaException
+ */
+ public abstract List<Metadata> filter(List<Metadata> metadataList) throws
TikaException;
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilterBase.java
similarity index 60%
rename from
tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java
rename to
tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilterBase.java
index 68654e4f2..dc5959ed2 100644
---
a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java
+++
b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilterBase.java
@@ -14,15 +14,27 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.metadata.listfilter;
+package org.apache.tika.metadata.filter;
import java.util.List;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-public class NoOpListFilter extends MetadataListFilter {
+/**
+ * Base class that applies {@link #filter(Metadata)} to each element of a list
+ * of metadata objects. Use this for context-free metadata filters
that
+ * do not require knowledge of more than a single metadata object at a time.
+ */
+public abstract class MetadataFilterBase extends MetadataFilter {
+
@Override
- public List<Metadata> filter(List<Metadata> metadataList) {
+ public List<Metadata> filter(List<Metadata> metadataList) throws
TikaException {
+ for (Metadata m : metadataList) {
+ filter(m);
+ }
return metadataList;
}
+
+ protected abstract void filter(Metadata metadata);
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
index d95472a1b..f2a7091a8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.metadata.filter;
+import java.util.List;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -27,8 +29,9 @@ public class NoOpFilter extends MetadataFilter {
public static final NoOpFilter NOOP_FILTER = new NoOpFilter();
+
@Override
- public void filter(Metadata metadata) throws TikaException {
- //no op
+ public List<Metadata> filter(List<Metadata> metadataList) throws
TikaException {
+ return metadataList;
}
}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java
deleted file mode 100644
index cede25bd5..000000000
---
a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.metadata.listfilter;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-
-public class CompositeMetadataListFilter extends MetadataListFilter {
-
- //no longer final to allow for no arg initialization during serialization
- private List<MetadataListFilter> filters;
-
- public CompositeMetadataListFilter() {
- filters = new ArrayList<>();
- }
- public CompositeMetadataListFilter(List<MetadataListFilter> filters) {
- this.filters = filters;
- }
-
- public void setFilters(List<MetadataListFilter> filters) {
- this.filters.clear();
- this.filters.addAll(filters);
- }
-
- public List<MetadataListFilter> getFilters() {
- return filters;
- }
-
- @Override
- public List<Metadata> filter(List<Metadata> metadataList) throws
TikaException {
- for (MetadataListFilter filter : filters) {
- metadataList = filter.filter(metadataList);
- }
- return metadataList;
- }
-
- @Override
- public String toString() {
- return "CompositeMetadataListFilter{" + "filters=" + filters + '}';
- }
-}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java
b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java
deleted file mode 100644
index 0735a98a1..000000000
---
a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.metadata.listfilter;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.List;
-
-import org.w3c.dom.Element;
-
-import org.apache.tika.config.ConfigBase;
-import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-
-public abstract class MetadataListFilter extends ConfigBase implements
Serializable {
- /**
- * Loads the metadata list filter from the config file if it exists,
otherwise returns NoOpFilter
- * @param root
- * @return
- * @throws TikaConfigException
- * @throws IOException
- */
- public static MetadataListFilter load(Element root, boolean allowMissing)
throws TikaConfigException,
- IOException {
- try {
- return buildComposite("metadataListFilters",
CompositeMetadataListFilter.class,
- "metadataListFilter", MetadataListFilter.class, root);
- } catch (TikaConfigException e) {
- if (allowMissing && e.getMessage().contains("could not find
metadataListFilters")) {
- return new NoOpListFilter();
- }
- throw e;
- }
- }
- public abstract List<Metadata> filter(List<Metadata> metadataList) throws
TikaException;
-}
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 0d8671e94..b65fdbd61 100644
---
a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++
b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -103,7 +103,7 @@ public class RecursiveParserWrapperHandler extends
AbstractRecursiveParserWrappe
super.endEmbeddedDocument(contentHandler, metadata);
addContent(contentHandler, metadata);
try {
- metadataFilter.filter(metadata);
+ metadataFilter.filter(List.of(metadata));
} catch (TikaException e) {
throw new SAXException(e);
}
@@ -123,7 +123,7 @@ public class RecursiveParserWrapperHandler extends
AbstractRecursiveParserWrappe
super.endDocument(contentHandler, metadata);
addContent(contentHandler, metadata);
try {
- metadataFilter.filter(metadata);
+ metadataFilter.filter(List.of(metadata));
} catch (TikaException e) {
throw new SAXException(e);
}
diff --git
a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java
b/tika-core/src/test/java/org/apache/tika/metadata/filter/AttachmentCountingListFilter.java
similarity index 91%
rename from
tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java
rename to
tika-core/src/test/java/org/apache/tika/metadata/filter/AttachmentCountingListFilter.java
index daa68c928..e0e0c72f3 100644
---
a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java
+++
b/tika-core/src/test/java/org/apache/tika/metadata/filter/AttachmentCountingListFilter.java
@@ -14,14 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.metadata.listfilter;
+package org.apache.tika.metadata.filter;
import java.util.List;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-public class AttachmentCountingListFilter extends MetadataListFilter {
+public class AttachmentCountingListFilter extends MetadataFilter {
@Override
public List<Metadata> filter(List<Metadata> metadataList) throws
TikaException {
if (metadataList == null || metadataList.isEmpty()) {
diff --git
a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/MetadataListFilterTest.java
b/tika-core/src/test/java/org/apache/tika/metadata/filter/MetadataListFilterTest.java
similarity index 86%
rename from
tika-core/src/test/java/org/apache/tika/metadata/listfilter/MetadataListFilterTest.java
rename to
tika-core/src/test/java/org/apache/tika/metadata/filter/MetadataListFilterTest.java
index ad5aa1a19..653fad323 100644
---
a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/MetadataListFilterTest.java
+++
b/tika-core/src/test/java/org/apache/tika/metadata/filter/MetadataListFilterTest.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.metadata.listfilter;
+package org.apache.tika.metadata.filter;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -31,10 +31,10 @@ public class MetadataListFilterTest {
public void testBasic() throws Exception {
TikaConfig tikaConfig;
try (InputStream is = MetadataListFilterTest.class.getResourceAsStream(
- "metadatalistfilter-config.xml")) {
+ "metadatafilter-config.xml")) {
tikaConfig = new TikaConfig(is);
}
- CompositeMetadataListFilter compositeMetadataListFilter =
(CompositeMetadataListFilter) tikaConfig.getMetadataListFilter();
+ CompositeMetadataFilter compositeMetadataListFilter =
(CompositeMetadataFilter) tikaConfig.getMetadataFilter();
assertEquals(1, compositeMetadataListFilter.getFilters().size());
assertTrue(compositeMetadataListFilter.getFilters().get(0) instanceof
AttachmentCountingListFilter);
}
diff --git
a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
index ac64734c2..517fb120c 100644
---
a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
+++
b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
@@ -18,16 +18,15 @@ package org.apache.tika.metadata.filter;
import java.util.Locale;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
/**
* Mock Filter for testing uppercasing of all values
*/
-public class MockUpperCaseFilter extends MetadataFilter {
+public class MockUpperCaseFilter extends MetadataFilterBase {
@Override
- public void filter(Metadata metadata) throws TikaException {
+ protected void filter(Metadata metadata) {
for (String n : metadata.names()) {
String[] vals = metadata.getValues(n);
metadata.remove(n);
diff --git
a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
index 91e4bd3be..51a940285 100644
---
a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
+++
b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -23,12 +23,14 @@ import static org.junit.jupiter.api.Assertions.assertNull;
import java.util.Arrays;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import org.junit.jupiter.api.Test;
import org.apache.tika.config.AbstractTikaConfigTest;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -46,7 +48,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set("author", "author");
MetadataFilter defaultFilter = new DefaultMetadataFilter();
- defaultFilter.filter(metadata);
+ metadata = filterOne(defaultFilter, metadata);
assertEquals(2, metadata.names().length);
assertEquals("title", metadata.get("title"));
@@ -60,7 +62,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set("author", "author");
MetadataFilter filter = new IncludeFieldMetadataFilter(set("title"));
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals(1, metadata.names().length);
assertEquals("title", metadata.get("title"));
assertNull(metadata.get("author"));
@@ -73,7 +75,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set("author", "author");
MetadataFilter filter = new ExcludeFieldMetadataFilter(set("title"));
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals(1, metadata.names().length);
assertEquals("author", metadata.get("author"));
assertNull(metadata.get("title"));
@@ -87,7 +89,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set("author", "author");
metadata.set("content", "content");
- config.getMetadataFilter().filter(metadata);
+ metadata = filterOne(config.getMetadataFilter(), metadata);
assertEquals(2, metadata.size());
assertEquals("title", metadata.get("title"));
@@ -102,8 +104,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set("author", "author");
metadata.set("content", "content");
- config.getMetadataFilter().filter(metadata);
-
+ metadata = filterOne(config.getMetadataFilter(), metadata);
assertEquals(1, metadata.size());
assertEquals("content", metadata.get("content"));
}
@@ -119,7 +120,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set("author", "author");
metadata.set("content", "content");
- config.getMetadataFilter().filter(metadata);
+ metadata = filterOne(config.getMetadataFilter(), metadata);
assertEquals(2, metadata.size());
assertArrayEquals(expectedTitles, metadata.getValues("title"));
@@ -133,12 +134,12 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set("author", "author");
MetadataFilter filter = new
ClearByMimeMetadataFilter(set("image/jpeg", "application/pdf"));
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals(0, metadata.size());
metadata.set(Metadata.CONTENT_TYPE,
MediaType.text("plain").toString());
metadata.set("author", "author");
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals(2, metadata.size());
assertEquals("author", metadata.get("author"));
@@ -153,12 +154,12 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set("author", "author");
MetadataFilter filter = config.getMetadataFilter();
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals(0, metadata.size());
metadata.set(Metadata.CONTENT_TYPE,
MediaType.text("plain").toString());
metadata.set("author", "author");
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals(2, metadata.size());
assertEquals("AUTHOR", metadata.get("author"));
}
@@ -173,7 +174,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set("a", "a-value");
MetadataFilter filter = config.getMetadataFilter();
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals("quick brown fox", metadata.get("content"));
assertEquals("a-value", metadata.get("b"));
assertNull(metadata.get("author"));
@@ -201,7 +202,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
MetadataFilter filter = config.getMetadataFilter();
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals("quick brown fox",
metadata.get(TikaCoreProperties.TIKA_CONTENT));
assertEquals("text/html", metadata.get("mime"));
}
@@ -215,7 +216,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set(Metadata.CONTENT_TYPE, "text/html");
MetadataFilter filter = config.getMetadataFilter();
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals("quick brown fox",
metadata.get(TikaCoreProperties.TIKA_CONTENT));
assertEquals("text/html", metadata.get("mime"));
}
@@ -229,7 +230,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
MetadataFilter filter = config.getMetadataFilter();
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals("quick brown fox",
metadata.get(TikaCoreProperties.TIKA_CONTENT));
assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE));
@@ -239,7 +240,7 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.add(TikaCoreProperties.TIKA_CONTENT.toString(), "text/plain;
charset=UTF-8");
metadata.add(TikaCoreProperties.TIKA_CONTENT.toString(),
"application/pdf; charset=UTF-8");
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length);
assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE));
}
@@ -252,15 +253,18 @@ public class TestMetadataFilter extends
AbstractTikaConfigTest {
metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
MetadataFilter filter = config.getMetadataFilter();
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals(0, metadata.names().length);
metadata = new Metadata();
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK
.name());
metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
- filter.filter(metadata);
+ metadata = filterOne(filter, metadata);
assertEquals(2, metadata.names().length);
}
+ private static Metadata filterOne(MetadataFilter filter, Metadata
singleMetadata) throws TikaException {
+ return filter.filter(List.of(singleMetadata)).get(0);
+ }
}
diff --git
a/tika-core/src/test/resources/org/apache/tika/metadata/listfilter/metadatalistfilter-config.xml
b/tika-core/src/test/resources/org/apache/tika/metadata/filter/metadatafilter-config.xml
similarity index 83%
rename from
tika-core/src/test/resources/org/apache/tika/metadata/listfilter/metadatalistfilter-config.xml
rename to
tika-core/src/test/resources/org/apache/tika/metadata/filter/metadatafilter-config.xml
index 55d25e58b..122a3696c 100644
---
a/tika-core/src/test/resources/org/apache/tika/metadata/listfilter/metadatalistfilter-config.xml
+++
b/tika-core/src/test/resources/org/apache/tika/metadata/filter/metadatafilter-config.xml
@@ -16,7 +16,7 @@
limitations under the License.
-->
<properties>
- <metadataListFilters>
- <metadataListFilter
class="org.apache.tika.metadata.listfilter.AttachmentCountingListFilter"/>
- </metadataListFilters>
+ <metadataFilters>
+ <metadataFilter
class="org.apache.tika.metadata.filter.AttachmentCountingListFilter"/>
+ </metadataFilters>
</properties>
\ No newline at end of file
diff --git
a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java
b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java
index 811958af4..5e788d959 100644
---
a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java
+++
b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java
@@ -29,14 +29,13 @@ import
org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;
import org.apache.tika.eval.core.textstats.TextStatsCalculator;
import org.apache.tika.eval.core.tokens.CommonTokenResult;
import org.apache.tika.eval.core.tokens.TokenCounts;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.filter.MetadataFilterBase;
-public class TikaEvalMetadataFilter extends MetadataFilter {
+public class TikaEvalMetadataFilter extends MetadataFilterBase {
public static String TIKA_EVAL_NS = "tika-eval" +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
@@ -73,7 +72,7 @@ public class TikaEvalMetadataFilter extends MetadataFilter {
@Override
- public void filter(Metadata metadata) throws TikaException {
+ public void filter(Metadata metadata) {
String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
if (StringUtils.isAllBlank(content)) {
return;
diff --git
a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java
b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java
index f1fd21c21..1374ac0eb 100644
---
a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java
+++
b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java
@@ -18,6 +18,8 @@ package org.apache.tika.eval.core.metadata;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import java.util.List;
+
import org.junit.jupiter.api.Test;
import org.apache.tika.metadata.Metadata;
@@ -36,7 +38,7 @@ public class TikaEvalMetadataFilterTest {
String content = "the quick brown fox, Zothro 1234 1235, jumped
over the lazy dog";
metadata.set(TikaCoreProperties.TIKA_CONTENT, content);
- filter.filter(metadata);
+ metadata = filter.filter(List.of(metadata)).get(0);
assertEquals("eng", metadata.get(TikaEvalMetadataFilter.LANGUAGE));
assertEquals(12, (int)
metadata.getInt(TikaEvalMetadataFilter.NUM_TOKENS));
assertEquals(11, (int)
metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_TOKENS));
diff --git
a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java
b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java
index e0f88023e..b7becc88d 100644
---
a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java
+++
b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java
@@ -17,14 +17,13 @@
package org.apache.tika.langdetect.opennlp.metadatafilter;
import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.langdetect.opennlp.OpenNLPDetector;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.filter.MetadataFilterBase;
-public class OpenNLPMetadataFilter extends MetadataFilter {
+public class OpenNLPMetadataFilter extends MetadataFilterBase {
private int maxCharsForDetection = 10000;
@@ -34,7 +33,7 @@ public class OpenNLPMetadataFilter extends MetadataFilter {
}
@Override
- public void filter(Metadata metadata) throws TikaException {
+ public void filter(Metadata metadata) {
OpenNLPDetector detector = new OpenNLPDetector();
detector.setMaxLength(maxCharsForDetection);
String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
diff --git
a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java
b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java
index a0e3dd6c7..cb850d2de 100644
---
a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java
+++
b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java
@@ -17,14 +17,13 @@
package org.apache.tika.langdetect.optimaize.metadatafilter;
import org.apache.tika.config.Field;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.filter.MetadataFilterBase;
-public class OptimaizeMetadataFilter extends MetadataFilter {
+public class OptimaizeMetadataFilter extends MetadataFilterBase {
private int maxCharsForDetection =
OptimaizeLangDetector.DEFAULT_MAX_CHARS_FOR_DETECTION;
@@ -34,7 +33,7 @@ public class OptimaizeMetadataFilter extends MetadataFilter {
}
@Override
- public void filter(Metadata metadata) throws TikaException {
+ public void filter(Metadata metadata) {
OptimaizeLangDetector detector = new
OptimaizeLangDetector(maxCharsForDetection);
detector.loadModels();
String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java
index af7150354..5fa033929 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java
@@ -18,11 +18,11 @@ package org.apache.tika.pipes.core;
import java.io.Serializable;
-import org.apache.tika.metadata.listfilter.MetadataListFilter;
+import org.apache.tika.metadata.filter.MetadataFilter;
/**
* Filter/Select some of the emitted output and pass it back to the client parser.
*/
-public abstract class PassbackFilter extends MetadataListFilter implements
Serializable {
+public abstract class PassbackFilter extends MetadataFilter implements
Serializable {
}
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java
index 40dab9cfe..1d0e83b9b 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java
@@ -60,8 +60,7 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.filter.MetadataFilter;
-import org.apache.tika.metadata.listfilter.MetadataListFilter;
-import org.apache.tika.metadata.listfilter.NoOpListFilter;
+import org.apache.tika.metadata.filter.NoOpFilter;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
@@ -482,8 +481,7 @@ public class PipesServer implements Runnable {
long start = System.currentTimeMillis();
String stack = getContainerStacktrace(t, parseData.getMetadataList());
//we need to apply the metadata filter after we pull out the stacktrace
- filterMetadata(t, parseData.getMetadataList());
- filterMetadataList(t, parseData);
+ filterMetadata(t, parseData);
ParseContext parseContext = t.getParseContext();
FetchEmitTuple.ON_PARSE_EXCEPTION onParseException =
t.getOnParseException();
EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig =
parseContext.get(EmbeddedDocumentBytesConfig.class);
@@ -527,26 +525,12 @@ public class PipesServer implements Runnable {
return false;
}
- private void filterMetadata(FetchEmitTuple t, List<Metadata> metadataList)
{
+ private void filterMetadata(FetchEmitTuple t, MetadataListAndEmbeddedBytes
parseData) {
MetadataFilter filter = t.getParseContext().get(MetadataFilter.class);
if (filter == null) {
filter = tikaConfig.getMetadataFilter();
}
- for (Metadata m : metadataList) {
- try {
- filter.filter(m);
- } catch (TikaException e) {
- LOG.warn("failed to filter metadata", e);
- }
- }
- }
-
- private void filterMetadataList(FetchEmitTuple t,
MetadataListAndEmbeddedBytes parseData) {
- MetadataListFilter filter =
t.getParseContext().get(MetadataListFilter.class);
- if (filter == null) {
- filter = tikaConfig.getMetadataListFilter();
- }
- if (filter instanceof NoOpListFilter) {
+ if (filter instanceof NoOpFilter) {
return;
}
try {
@@ -956,7 +940,7 @@ public class PipesServer implements Runnable {
return metadataList;
}
- public void filter(MetadataListFilter filter) throws TikaException {
+ public void filter(MetadataFilter filter) throws TikaException {
metadataList = filter.filter(metadataList);
}
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
index 04c466c1d..9cc33478f 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
@@ -29,12 +29,10 @@ import org.junit.jupiter.api.io.TempDir;
import org.apache.tika.config.TikaTaskTimeout;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.AttachmentCountingListFilter;
import org.apache.tika.metadata.filter.CompositeMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.MockUpperCaseFilter;
-import org.apache.tika.metadata.listfilter.AttachmentCountingListFilter;
-import org.apache.tika.metadata.listfilter.CompositeMetadataListFilter;
-import org.apache.tika.metadata.listfilter.MetadataListFilter;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.api.FetchEmitTuple;
import org.apache.tika.pipes.api.PipesResult;
@@ -88,8 +86,8 @@ public class PipesClientTest {
@Test
public void testMetadataListFilter(@TempDir Path tmp) throws Exception {
ParseContext parseContext = new ParseContext();
- MetadataListFilter metadataFilter = new
CompositeMetadataListFilter(List.of(new AttachmentCountingListFilter()));
- parseContext.set(MetadataListFilter.class, metadataFilter);
+ MetadataFilter metadataFilter = new
CompositeMetadataFilter(List.of(new AttachmentCountingListFilter()));
+ parseContext.set(MetadataFilter.class, metadataFilter);
String testFile = "mock-embedded.xml";
@@ -111,8 +109,8 @@ public class PipesClientTest {
//I did both manually during development, but unit tests are better. :D
ParseContext parseContext = new ParseContext();
parseContext.set(TikaTaskTimeout.class, new TikaTaskTimeout(1000));
- MetadataListFilter metadataFilter = new
CompositeMetadataListFilter(List.of(new AttachmentCountingListFilter()));
- parseContext.set(MetadataListFilter.class, metadataFilter);
+ MetadataFilter metadataFilter = new
CompositeMetadataFilter(List.of(new AttachmentCountingListFilter()));
+ parseContext.set(MetadataFilter.class, metadataFilter);
String testFile = "mock-timeout-10s.xml";
PipesClient pipesClient = init(tmp, testFile);
diff --git
a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java
b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java
index a0fba59e9..d3090a36f 100644
---
a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java
+++
b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java
@@ -19,6 +19,7 @@ package org.apache.tika.pipes.emitter.solr;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -66,7 +67,7 @@ public class SolrEmitterDevTest {
mappings.put(TikaCoreProperties.CREATED.getName(), "created");
mappings.put(TikaCoreProperties.TIKA_CONTENT.getName(), "content");
filter.setMappings(mappings);
- filter.filter(metadata);
+ metadata = filter.filter(List.of(metadata)).get(0);
solrEmitter.emit(emitKey, Collections.singletonList(metadata), new
ParseContext());
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 483596199..867481d45 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -76,7 +76,7 @@ public class TikaLoader {
private Parser parsers;
private Detector detectors;
private EncodingDetector encodingDetectors;
- private MetadataFilter metadataFilters;
+ private MetadataFilter metadataFilter;
private Renderer renderers;
private TikaLoader(TikaJsonConfig config, ClassLoader classLoader,
@@ -193,14 +193,14 @@ public class TikaLoader {
* @throws TikaConfigException if loading fails
*/
public synchronized MetadataFilter loadMetadataFilters() throws
TikaConfigException {
- if (metadataFilters == null) {
+ if (metadataFilter == null) {
CompositeComponentLoader<MetadataFilter> loader = new
CompositeComponentLoader<>(
MetadataFilter.class, "metadataFilters",
"metadata-filters",
classLoader, objectMapper);
List<MetadataFilter> filterList = loader.loadFromArray(config);
- metadataFilters = new CompositeMetadataFilter(filterList);
+ metadataFilter = new CompositeMetadataFilter(filterList);
}
- return metadataFilters;
+ return metadataFilter;
}
/**
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index e0ba6279a..f620f7694 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -41,7 +41,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.listfilter.MetadataListFilter;
+import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
@@ -87,8 +87,10 @@ public class RecursiveMetadataResource {
//we shouldn't get here?
LOG.error("something went seriously wrong", e);
}
- MetadataListFilter metadataListFilter =
context.get(MetadataListFilter.class, getConfig().getMetadataListFilter());
- return metadataListFilter.filter(handler.getMetadataList());
+ MetadataFilter metadataFilter = context.get(MetadataFilter.class,
getConfig().getMetadataFilter());
+ //note that the filter may modify the contents of handler's metadata list.
+ //do a deep copy if that's problematic.
+ return metadataFilter.filter(handler.getMetadataList());
}
static HandlerConfig buildHandlerConfig(MultivaluedMap<String, String>
httpHeaders, String handlerTypeName, HandlerConfig.PARSE_MODE parseMode) {
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 057c797b2..fcbb24c87 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -551,11 +551,14 @@ public class TikaResource {
throws IOException, TikaException {
Metadata metadata = new Metadata();
parseToMetadata(getInputStream(att.getObject(InputStream.class),
metadata, httpHeaders, info), metadata, preparePostHeaderMap(att, httpHeaders),
info, handlerTypeName);
- TikaResource
+ List<Metadata> ret = TikaResource
.getConfig()
.getMetadataFilter()
- .filter(metadata);
- return metadata;
+ .filter(List.of(metadata));
+ if (ret == null || ret.isEmpty()) {
+ return new Metadata();
+ }
+ return ret.get(0);
}
@PUT
@@ -566,10 +569,13 @@ public class TikaResource {
throws IOException, TikaException {
Metadata metadata = new Metadata();
parseToMetadata(getInputStream(is, metadata, httpHeaders, info),
metadata, httpHeaders.getRequestHeaders(), info, handlerTypeName);
- TikaResource
+ List<Metadata> ret = TikaResource
.getConfig()
.getMetadataFilter()
- .filter(metadata);
+ .filter(List.of(metadata));
+ if (ret == null || ret.isEmpty()) {
+ return new Metadata();
+ }
return metadata;
}