This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4546 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 319970eb1ce09acf89b99b4aaa0b9b0c4dd27d2f Author: tallison <[email protected]> AuthorDate: Tue Nov 25 14:10:18 2025 -0500 TIKA-4546 -- simplify metadata filtering --- CHANGES.txt | 18 ++++++- .../tika/annotation/TikaComponentProcessor.java | 2 +- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +- .../src/main/java/org/apache/tika/gui/TikaGUI.java | 2 +- .../java/org/apache/tika/config/TikaConfig.java | 11 ---- .../filter/CaptureGroupMetadataFilter.java | 5 +- .../ClearByAttachmentTypeMetadataFilter.java | 9 ++-- .../metadata/filter/ClearByMimeMetadataFilter.java | 5 +- .../metadata/filter/CompositeMetadataFilter.java | 7 +-- .../filter/DateNormalizingMetadataFilter.java | 6 +-- .../filter/ExcludeFieldMetadataFilter.java | 5 +- .../metadata/filter/FieldNameMappingFilter.java | 5 +- .../metadata/filter/GeoPointMetadataFilter.java | 5 +- .../filter/IncludeFieldMetadataFilter.java | 5 +- .../tika/metadata/filter/MetadataFilter.java | 21 ++++---- .../MetadataFilterBase.java} | 18 +++++-- .../apache/tika/metadata/filter/NoOpFilter.java | 7 ++- .../listfilter/CompositeMetadataListFilter.java | 58 ---------------------- .../metadata/listfilter/MetadataListFilter.java | 51 ------------------- .../tika/sax/RecursiveParserWrapperHandler.java | 4 +- .../AttachmentCountingListFilter.java | 4 +- .../MetadataListFilterTest.java | 6 +-- .../tika/metadata/filter/MockUpperCaseFilter.java | 5 +- .../tika/metadata/filter/TestMetadataFilter.java | 40 ++++++++------- .../metadatafilter-config.xml} | 6 +-- .../eval/core/metadata/TikaEvalMetadataFilter.java | 7 ++- .../core/metadata/TikaEvalMetadataFilterTest.java | 4 +- .../metadatafilter/OpenNLPMetadataFilter.java | 7 ++- .../metadatafilter/OptimaizeMetadataFilter.java | 7 ++- .../org/apache/tika/pipes/core/PassbackFilter.java | 4 +- .../org/apache/tika/pipes/core/PipesServer.java | 26 ++-------- .../apache/tika/pipes/core/PipesClientTest.java | 12 ++--- .../pipes/emitter/solr/SolrEmitterDevTest.java | 3 +- .../org/apache/tika/config/loader/TikaLoader.java | 8 +-- .../core/resource/RecursiveMetadataResource.java | 8 +-- .../tika/server/core/resource/TikaResource.java | 16 ++++-- 36 files changed, 154 insertions(+), 255 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 2d91e48c2..bea9b5235 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,16 +1,30 @@ Release 4.0.0-BETA1 - ??? + BREAKING CHANGES + * Moved towards default json based configuration (TIKA-4544 and many others). + + * tika-pipes implementation modules have been reorganized by + resource (tika-pipes-solr) vs task (tika-pipes-fetcher-solr) + (TIKA-4543). Note that the file-system pipes components have + been taken out of tika-pipes-core and placed in their own + pf4j module: tika-pipes-file-system. + + * tika-pipes implementation modules are now pf4j plugins (TIKA-4519). + * tika-pipes core classes have been moved to a new module: tika-pipes-core, and the FileSystem pipes components have moved (TIKA-4334). - * Headers are no longer injected into the body/content of MSG files (TIKA-4345). Please open - a ticket if you need this behavior across email formats. + * MetadataListFilter has been renamed MetadataFilter, and + MetadataFilter has been removed (TIKA-4546). * Removed several modules, including: tika-batch (TIKA-4333), snaps deployment (TIKA-4502), dotnet (TIKA-4332), advanced media module (TIKA-4500), tika-dl module (TIKA-4499), tika-fuzzing module (TIKA-4506). + * Headers are no longer injected into the body/content of MSG files (TIKA-4345). Please open + a ticket if you need this behavior across email formats. + * API changes in the EmbeddedStreamTranslator (TIKA-4518). OTHER CHANGES diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java index 3a1800679..f37ac1e3f 100644 --- a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java +++ b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java @@ -70,7 +70,7 @@ public class TikaComponentProcessor extends AbstractProcessor { SERVICE_INTERFACES.put("org.apache.tika.detect.EncodingDetector", "encoding-detectors"); SERVICE_INTERFACES.put("org.apache.tika.language.translate.Translator", "translators"); SERVICE_INTERFACES.put("org.apache.tika.renderer.Renderer", "renderers"); - SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter", "metadata-filters"); + SERVICE_INTERFACES.put("org.apache.tika.metadata.listfilter.MetadataFilter", "metadata-filters"); } private Messager messager; diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 2cb3f7f5d..050c00dfe 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -525,7 +525,7 @@ public class TikaCLI { JsonMetadataList.setPrettyPrinting(prettyPrint); try (Writer writer = getOutputWriter(output, encoding)) { List<Metadata> metadataList = handler.getMetadataList(); - metadataList = config.getMetadataListFilter().filter(metadataList); + metadataList = config.getMetadataFilter().filter(metadataList); JsonMetadataList.toJson(metadataList, writer); } } diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index 1da9cda9c..a2a4d526e 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -384,7 +384,7 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener StringWriter jsonBuffer = new StringWriter(); JsonMetadataList.setPrettyPrinting(true); List<Metadata> metadataList = recursiveParserWrapperHandler.getMetadataList(); - metadataList = tikaConfig.getMetadataListFilter().filter(metadataList); + metadataList = tikaConfig.getMetadataFilter().filter(metadataList); JsonMetadataList.toJson(metadataList, jsonBuffer); setText(json, jsonBuffer.toString()); } diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index 14ff21b11..fc7e84d78 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -64,8 +64,6 @@ import org.apache.tika.language.translate.DefaultTranslator; import org.apache.tika.language.translate.Translator; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; -import org.apache.tika.metadata.listfilter.MetadataListFilter; -import org.apache.tika.metadata.listfilter.NoOpListFilter; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeTypeException; @@ -108,7 +106,6 @@ public class TikaConfig { private final EncodingDetector encodingDetector; private final Renderer renderer; private final MetadataFilter metadataFilter; - private final MetadataListFilter metadataListFilter; private final AutoDetectParserConfig autoDetectParserConfig; private static final Map<String, InitializableProblemHandler> strategyMap = new HashMap<>(); @@ -191,7 +188,6 @@ public class TikaConfig { this.translator = translatorLoader.loadOverall(element, mimeTypes, loader); this.executorService = executorLoader.loadOverall(element, mimeTypes, loader); this.metadataFilter = MetadataFilter.load(element, true); - this.metadataListFilter = MetadataListFilter.load(element, true); this.autoDetectParserConfig = AutoDetectParserConfig.load(element); this.serviceLoader = loader; setMaxJsonStringFieldLength(element); @@ -220,7 +216,6 @@ public class TikaConfig { this.translator = getDefaultTranslator(serviceLoader); this.executorService = getDefaultExecutorService(); this.metadataFilter = new NoOpFilter(); - this.metadataListFilter = new NoOpListFilter(); this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT; TIMES_INSTANTIATED.incrementAndGet(); } @@ -267,7 +262,6 @@ public class TikaConfig { this.translator = getDefaultTranslator(serviceLoader); this.executorService = getDefaultExecutorService(); this.metadataFilter = new NoOpFilter(); - this.metadataListFilter = new NoOpListFilter(); this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT; } else { ServiceLoader tmpServiceLoader = new ServiceLoader(); @@ -295,7 +289,6 @@ public class TikaConfig { this.executorService = executorLoader.loadOverall(element, mimeTypes, serviceLoader); this.metadataFilter = MetadataFilter.load(element, true); - this.metadataListFilter = MetadataListFilter.load(element, true); this.autoDetectParserConfig = AutoDetectParserConfig.load(element); setMaxJsonStringFieldLength(element); } catch (SAXException e) { @@ -644,10 +637,6 @@ public class TikaConfig { return metadataFilter; } - public MetadataListFilter getMetadataListFilter() { - return metadataListFilter; - } - public AutoDetectParserConfig getAutoDetectParserConfig() { return autoDetectParserConfig; } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java index 7023c4cb8..fd1cd0332 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java @@ -26,7 +26,6 @@ import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.StringUtils; @@ -53,7 +52,7 @@ import org.apache.tika.utils.StringUtils; * will overwrite the value in that field. Again, if there are multiple * values in that field, those will all be overwritten. */ -public class CaptureGroupMetadataFilter extends MetadataFilter implements Initializable { +public class CaptureGroupMetadataFilter extends MetadataFilterBase implements Initializable { private String regexString; private Pattern regex; @@ -61,7 +60,7 @@ public class CaptureGroupMetadataFilter extends MetadataFilter implements Initia private String targetField; @Override - public void filter(Metadata metadata) throws TikaException { + protected void filter(Metadata metadata) { String val = metadata.get(sourceField); if (StringUtils.isBlank(val)) { return; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java index 6157b4e5c..c4a912c98 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java @@ -23,7 +23,6 @@ import java.util.Set; import org.apache.tika.config.Field; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -32,7 +31,7 @@ import org.apache.tika.metadata.TikaCoreProperties; * attachment type matches one of the types. The idea is that you might not want * to store/transmit metadata for images or specific file types. */ -public class ClearByAttachmentTypeMetadataFilter extends MetadataFilter { +public class ClearByAttachmentTypeMetadataFilter extends MetadataFilterBase { private final Set<String> types; public ClearByAttachmentTypeMetadataFilter() { @@ -43,8 +42,8 @@ public class ClearByAttachmentTypeMetadataFilter extends MetadataFilter { this.types = types; } - @Override - public void filter(Metadata metadata) throws TikaException { + + protected void filter(Metadata metadata) { String type = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); if (type == null) { return; @@ -87,4 +86,6 @@ public class ClearByAttachmentTypeMetadataFilter extends MetadataFilter { public List<String> getTypes() { return new ArrayList<>(types); } + + } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java index adcffc5ad..d97eeb0d8 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java @@ -22,7 +22,6 @@ import java.util.List; import java.util.Set; import org.apache.tika.config.Field; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -31,7 +30,7 @@ import org.apache.tika.mime.MediaType; * mime matches the mime filter. The idea is that you might not want * to store/transmit metadata for images or specific file types. */ -public class ClearByMimeMetadataFilter extends MetadataFilter { +public class ClearByMimeMetadataFilter extends MetadataFilterBase { private final Set<String> mimes; public ClearByMimeMetadataFilter() { @@ -43,7 +42,7 @@ public class ClearByMimeMetadataFilter extends MetadataFilter { } @Override - public void filter(Metadata metadata) throws TikaException { + protected void filter(Metadata metadata) { String mimeString = metadata.get(Metadata.CONTENT_TYPE); if (mimeString == null) { return; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java index 8f5907c44..3b306cd58 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java @@ -44,14 +44,15 @@ public class CompositeMetadataFilter extends MetadataFilter { } @Override - public void filter(Metadata metadata) throws TikaException { + public List<Metadata> filter(List<Metadata> metadataList) throws TikaException { for (MetadataFilter filter : filters) { - filter.filter(metadata); + metadataList = filter.filter(metadataList); } + return metadataList; } @Override public String toString() { - return "CompositeMetadataFilter{" + "filters=" + filters + '}'; + return "CompositeMetadataListFilter{" + "filters=" + filters + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java index ed8280bcc..868280c56 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java @@ -27,7 +27,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.tika.config.Field; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; @@ -43,7 +42,7 @@ import org.apache.tika.metadata.Property; * if the file format does not specify a timezone. * */ -public class DateNormalizingMetadataFilter extends MetadataFilter { +public class DateNormalizingMetadataFilter extends MetadataFilterBase { private static TimeZone UTC = TimeZone.getTimeZone("UTC"); @@ -51,8 +50,7 @@ public class DateNormalizingMetadataFilter extends MetadataFilter { private TimeZone defaultTimeZone = UTC; - @Override - public void filter(Metadata metadata) throws TikaException { + protected void filter(Metadata metadata) { SimpleDateFormat dateFormatter = null; SimpleDateFormat utcFormatter = null; for (String n : metadata.names()) { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java index e00089910..9a2683934 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java @@ -22,10 +22,9 @@ import java.util.List; import java.util.Set; import org.apache.tika.config.Field; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -public class ExcludeFieldMetadataFilter extends MetadataFilter { +public class ExcludeFieldMetadataFilter extends MetadataFilterBase { private final Set<String> excludeSet; public ExcludeFieldMetadataFilter() { @@ -37,7 +36,7 @@ public class ExcludeFieldMetadataFilter extends MetadataFilter { } @Override - public void filter(Metadata metadata) throws TikaException { + protected void filter(Metadata metadata) { for (String field : excludeSet) { metadata.remove(field); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java index fe346e74d..2e5ae7b33 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java @@ -20,17 +20,16 @@ import java.util.LinkedHashMap; import java.util.Map; import org.apache.tika.config.Field; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -public class FieldNameMappingFilter extends MetadataFilter { +public class FieldNameMappingFilter extends MetadataFilterBase { Map<String, String> mappings = new LinkedHashMap<>(); boolean excludeUnmapped = true; @Override - public void filter(Metadata metadata) throws TikaException { + protected void filter(Metadata metadata) { if (excludeUnmapped) { for (String n : metadata.names()) { if (mappings.containsKey(n)) { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java index ac48454cf..8e18da37c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java @@ -17,7 +17,6 @@ package org.apache.tika.metadata.filter; import org.apache.tika.config.Field; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.StringUtils; @@ -29,7 +28,7 @@ import org.apache.tika.utils.StringUtils; * * If you need any other mappings, please open a ticket on our JIRA. */ -public class GeoPointMetadataFilter extends MetadataFilter { +public class GeoPointMetadataFilter extends MetadataFilterBase { String geoPointFieldName = "location"; @@ -49,7 +48,7 @@ public class GeoPointMetadataFilter extends MetadataFilter { } @Override - public void filter(Metadata metadata) throws TikaException { + protected void filter(Metadata metadata) { String lat = metadata.get(TikaCoreProperties.LATITUDE); if (StringUtils.isEmpty(lat)) { return; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java index 3fe2a90fa..e0c5285c3 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java @@ -22,10 +22,9 @@ import java.util.List; import java.util.Set; import org.apache.tika.config.Field; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -public class IncludeFieldMetadataFilter extends MetadataFilter { +public class IncludeFieldMetadataFilter extends MetadataFilterBase { private final Set<String> includeSet; public IncludeFieldMetadataFilter() { @@ -49,7 +48,7 @@ public class IncludeFieldMetadataFilter extends MetadataFilter { } @Override - public void filter(Metadata metadata) throws TikaException { + protected void filter(Metadata metadata) { for (String n : metadata.names()) { if (!includeSet.contains(n)) { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java index 21eb3eced..135838581 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java @@ -14,11 +14,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.tika.metadata.filter; import java.io.IOException; import java.io.Serializable; +import java.util.List; import org.w3c.dom.Element; @@ -27,15 +27,9 @@ import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -/** - * Filters the metadata in place after the parse - * - * @since Apache Tika 1.25 - */ public abstract class MetadataFilter extends ConfigBase implements Serializable { - /** - * Loads the metadata filter from the config file if it exists, otherwise returns NoOpFilter + * Loads the metadata list filter from the config file if it exists, otherwise returns NoOpFilter * @param root * @return * @throws TikaConfigException @@ -54,5 +48,14 @@ public abstract class MetadataFilter extends ConfigBase implements Serializable } } - public abstract void filter(Metadata metadata) throws TikaException; + /** + * For efficiency's sake, the original metadata list and data therein may be modified. + * Users are responsible for doing a defensive copy before calling filter if mutability + * would be problematic. + * + * @param metadataList + * @return + * @throws TikaException + */ + public abstract List<Metadata> filter(List<Metadata> metadataList) throws TikaException; } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilterBase.java similarity index 60% rename from tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java rename to tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilterBase.java index 68654e4f2..dc5959ed2 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilterBase.java @@ -14,15 +14,27 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.metadata.listfilter; +package org.apache.tika.metadata.filter; import java.util.List; +import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -public class NoOpListFilter extends MetadataListFilter { +/** + * Base class for iterating a call to {@link #filter(Metadata)} on a list + * of metadata objects. This should be used on context-free metadata filters that + * do not require knowledge of more than a single metadata object at a time + */ +public abstract class MetadataFilterBase extends MetadataFilter { + @Override - public List<Metadata> filter(List<Metadata> metadataList) { + public List<Metadata> filter(List<Metadata> metadataList) throws TikaException { + for (Metadata m : metadataList) { + filter(m); + } return metadataList; } + + protected abstract void filter(Metadata metadata); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java index d95472a1b..f2a7091a8 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java @@ -16,6 +16,8 @@ */ package org.apache.tika.metadata.filter; +import java.util.List; + import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -27,8 +29,9 @@ public class NoOpFilter extends MetadataFilter { public static final NoOpFilter NOOP_FILTER = new NoOpFilter(); + @Override - public void filter(Metadata metadata) throws TikaException { - //no op + public List<Metadata> filter(List<Metadata> metadataList) throws TikaException { + return metadataList; } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java deleted file mode 100644 index cede25bd5..000000000 --- a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.metadata.listfilter; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; - -public class CompositeMetadataListFilter extends MetadataListFilter { - - //no longer final to allow for no arg initialization during serialization - private List<MetadataListFilter> filters; - - public CompositeMetadataListFilter() { - filters = new ArrayList<>(); - } - public CompositeMetadataListFilter(List<MetadataListFilter> filters) { - this.filters = filters; - } - - public void setFilters(List<MetadataListFilter> filters) { - this.filters.clear(); - this.filters.addAll(filters); - } - - public List<MetadataListFilter> getFilters() { - return filters; - } - - @Override - public List<Metadata> filter(List<Metadata> metadataList) throws TikaException { - for (MetadataListFilter filter : filters) { - metadataList = filter.filter(metadataList); - } - return metadataList; - } - - @Override - public String toString() { - return "CompositeMetadataListFilter{" + "filters=" + filters + '}'; - } -} diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java deleted file mode 100644 index 0735a98a1..000000000 --- a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.metadata.listfilter; - -import java.io.IOException; -import java.io.Serializable; -import java.util.List; - -import org.w3c.dom.Element; - -import org.apache.tika.config.ConfigBase; -import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; - -public abstract class MetadataListFilter extends ConfigBase implements Serializable { - /** - * Loads the metadata list filter from the config file if it exists, otherwise returns NoOpFilter - * @param root - * @return - * @throws TikaConfigException - * @throws IOException - */ - public static MetadataListFilter load(Element root, boolean allowMissing) throws TikaConfigException, - IOException { - try { - return buildComposite("metadataListFilters", CompositeMetadataListFilter.class, - "metadataListFilter", MetadataListFilter.class, root); - } catch (TikaConfigException e) { - if (allowMissing && e.getMessage().contains("could not find metadataListFilters")) { - return new NoOpListFilter(); - } - throw e; - } - } - public abstract List<Metadata> filter(List<Metadata> metadataList) throws TikaException; -} diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java index 0d8671e94..b65fdbd61 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java @@ -103,7 +103,7 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe super.endEmbeddedDocument(contentHandler, metadata); addContent(contentHandler, metadata); try { - metadataFilter.filter(metadata); + metadataFilter.filter(List.of(metadata)); } catch (TikaException e) { throw new SAXException(e); } @@ -123,7 +123,7 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe super.endDocument(contentHandler, metadata); addContent(contentHandler, metadata); try { - metadataFilter.filter(metadata); + metadataFilter.filter(List.of(metadata)); } catch (TikaException e) { throw new SAXException(e); } diff --git a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/AttachmentCountingListFilter.java similarity index 91% rename from tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java rename to tika-core/src/test/java/org/apache/tika/metadata/filter/AttachmentCountingListFilter.java index daa68c928..e0e0c72f3 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/AttachmentCountingListFilter.java @@ -14,14 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.metadata.listfilter; +package org.apache.tika.metadata.filter; import java.util.List; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -public class AttachmentCountingListFilter extends MetadataListFilter { +public class AttachmentCountingListFilter extends MetadataFilter { @Override public List<Metadata> filter(List<Metadata> metadataList) throws TikaException { if (metadataList == null || metadataList.isEmpty()) { diff --git a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/MetadataListFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/MetadataListFilterTest.java similarity index 86% rename from tika-core/src/test/java/org/apache/tika/metadata/listfilter/MetadataListFilterTest.java rename to tika-core/src/test/java/org/apache/tika/metadata/filter/MetadataListFilterTest.java index ad5aa1a19..653fad323 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/MetadataListFilterTest.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/MetadataListFilterTest.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.metadata.listfilter; +package org.apache.tika.metadata.filter; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -31,10 +31,10 @@ public class MetadataListFilterTest { public void testBasic() throws Exception { TikaConfig tikaConfig; try (InputStream is = MetadataListFilterTest.class.getResourceAsStream( - "metadatalistfilter-config.xml")) { + "metadatafilter-config.xml")) { tikaConfig = new TikaConfig(is); } - CompositeMetadataListFilter compositeMetadataListFilter = (CompositeMetadataListFilter) tikaConfig.getMetadataListFilter(); + CompositeMetadataFilter compositeMetadataListFilter = (CompositeMetadataFilter) tikaConfig.getMetadataFilter(); assertEquals(1, compositeMetadataListFilter.getFilters().size()); assertTrue(compositeMetadataListFilter.getFilters().get(0) instanceof AttachmentCountingListFilter); } diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java index ac64734c2..517fb120c 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java @@ -18,16 +18,15 @@ package org.apache.tika.metadata.filter; import java.util.Locale; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; /** * Mock Filter for testing uppercasing of all values */ -public class MockUpperCaseFilter extends MetadataFilter { +public class MockUpperCaseFilter extends MetadataFilterBase { @Override - public void filter(Metadata metadata) throws TikaException { + protected void filter(Metadata metadata) { for (String n : metadata.names()) { String[] vals = metadata.getValues(n); metadata.remove(n); diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java index 91e4bd3be..51a940285 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java @@ -23,12 +23,14 @@ import static org.junit.jupiter.api.Assertions.assertNull; import java.util.Arrays; import java.util.HashSet; +import java.util.List; import java.util.Set; import org.junit.jupiter.api.Test; import org.apache.tika.config.AbstractTikaConfigTest; import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -46,7 +48,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set("author", "author"); MetadataFilter defaultFilter = new DefaultMetadataFilter(); - defaultFilter.filter(metadata); + metadata = filterOne(defaultFilter, metadata); assertEquals(2, metadata.names().length); assertEquals("title", metadata.get("title")); @@ -60,7 +62,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set("author", "author"); MetadataFilter filter = new IncludeFieldMetadataFilter(set("title")); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals(1, metadata.names().length); assertEquals("title", metadata.get("title")); assertNull(metadata.get("author")); @@ -73,7 +75,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set("author", "author"); MetadataFilter filter = new ExcludeFieldMetadataFilter(set("title")); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals(1, metadata.names().length); assertEquals("author", metadata.get("author")); assertNull(metadata.get("title")); @@ -87,7 +89,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set("author", "author"); metadata.set("content", "content"); - config.getMetadataFilter().filter(metadata); + metadata = filterOne(config.getMetadataFilter(), metadata); assertEquals(2, metadata.size()); assertEquals("title", metadata.get("title")); @@ -102,8 +104,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set("author", "author"); metadata.set("content", "content"); - config.getMetadataFilter().filter(metadata); - + metadata = filterOne(config.getMetadataFilter(), metadata); assertEquals(1, metadata.size()); assertEquals("content", metadata.get("content")); } @@ -119,7 +120,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set("author", "author"); metadata.set("content", "content"); - config.getMetadataFilter().filter(metadata); + metadata = filterOne(config.getMetadataFilter(), metadata); assertEquals(2, metadata.size()); assertArrayEquals(expectedTitles, metadata.getValues("title")); @@ -133,12 +134,12 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set("author", "author"); MetadataFilter filter = new ClearByMimeMetadataFilter(set("image/jpeg", "application/pdf")); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals(0, metadata.size()); metadata.set(Metadata.CONTENT_TYPE, MediaType.text("plain").toString()); metadata.set("author", "author"); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals(2, metadata.size()); assertEquals("author", metadata.get("author")); @@ -153,12 +154,12 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set("author", "author"); MetadataFilter filter = config.getMetadataFilter(); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals(0, metadata.size()); metadata.set(Metadata.CONTENT_TYPE, MediaType.text("plain").toString()); metadata.set("author", "author"); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals(2, metadata.size()); assertEquals("AUTHOR", metadata.get("author")); } @@ -173,7 +174,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set("a", "a-value"); MetadataFilter filter = config.getMetadataFilter(); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals("quick brown fox", metadata.get("content")); assertEquals("a-value", metadata.get("b")); assertNull(metadata.get("author")); @@ -201,7 +202,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); MetadataFilter filter = config.getMetadataFilter(); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals("quick brown fox", metadata.get(TikaCoreProperties.TIKA_CONTENT)); assertEquals("text/html", metadata.get("mime")); } @@ -215,7 +216,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set(Metadata.CONTENT_TYPE, "text/html"); MetadataFilter filter = config.getMetadataFilter(); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals("quick brown fox", metadata.get(TikaCoreProperties.TIKA_CONTENT)); assertEquals("text/html", metadata.get("mime")); } @@ -229,7 +230,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); MetadataFilter filter = config.getMetadataFilter(); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals("quick brown fox", metadata.get(TikaCoreProperties.TIKA_CONTENT)); assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE)); @@ -239,7 +240,7 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.add(TikaCoreProperties.TIKA_CONTENT.toString(), "text/plain; charset=UTF-8"); metadata.add(TikaCoreProperties.TIKA_CONTENT.toString(), "application/pdf; charset=UTF-8"); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length); assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE)); } @@ -252,15 +253,18 @@ public class TestMetadataFilter extends AbstractTikaConfigTest { metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); MetadataFilter filter = config.getMetadataFilter(); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals(0, metadata.names().length); metadata = new Metadata(); metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK .name()); metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); - filter.filter(metadata); + metadata = filterOne(filter, metadata); assertEquals(2, metadata.names().length); } + private static Metadata filterOne(MetadataFilter filter, Metadata singleMetadata) throws TikaException { + return filter.filter(List.of(singleMetadata)).get(0); + } } diff --git a/tika-core/src/test/resources/org/apache/tika/metadata/listfilter/metadatalistfilter-config.xml b/tika-core/src/test/resources/org/apache/tika/metadata/filter/metadatafilter-config.xml similarity index 83% rename from tika-core/src/test/resources/org/apache/tika/metadata/listfilter/metadatalistfilter-config.xml rename to tika-core/src/test/resources/org/apache/tika/metadata/filter/metadatafilter-config.xml index 55d25e58b..122a3696c 100644 --- a/tika-core/src/test/resources/org/apache/tika/metadata/listfilter/metadatalistfilter-config.xml +++ b/tika-core/src/test/resources/org/apache/tika/metadata/filter/metadatafilter-config.xml @@ -16,7 +16,7 @@ limitations under the License. --> <properties> - <metadataListFilters> - <metadataListFilter class="org.apache.tika.metadata.listfilter.AttachmentCountingListFilter"/> - </metadataListFilters> + <metadataFilters> + <metadataFilter class="org.apache.tika.metadata.filter.AttachmentCountingListFilter"/> + </metadataFilters> </properties> \ No newline at end of file diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java index 811958af4..5e788d959 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java @@ -29,14 +29,13 @@ import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator; import org.apache.tika.eval.core.textstats.TextStatsCalculator; import org.apache.tika.eval.core.tokens.CommonTokenResult; import org.apache.tika.eval.core.tokens.TokenCounts; -import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.metadata.filter.MetadataFilterBase; -public class TikaEvalMetadataFilter extends MetadataFilter { +public class TikaEvalMetadataFilter extends MetadataFilterBase { public static String TIKA_EVAL_NS = "tika-eval" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; @@ -73,7 +72,7 @@ public class TikaEvalMetadataFilter extends MetadataFilter { @Override - public void filter(Metadata metadata) throws TikaException { + public void filter(Metadata metadata) { String content = metadata.get(TikaCoreProperties.TIKA_CONTENT); if (StringUtils.isAllBlank(content)) { return; diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java index f1fd21c21..1374ac0eb 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java @@ -18,6 +18,8 @@ package org.apache.tika.eval.core.metadata; import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.List; + import org.junit.jupiter.api.Test; import org.apache.tika.metadata.Metadata; @@ -36,7 +38,7 @@ public class TikaEvalMetadataFilterTest { String content = "the quick brown fox, Zothro 1234 1235, jumped over the lazy dog"; metadata.set(TikaCoreProperties.TIKA_CONTENT, content); - filter.filter(metadata); + metadata = filter.filter(List.of(metadata)).get(0); assertEquals("eng", metadata.get(TikaEvalMetadataFilter.LANGUAGE)); assertEquals(12, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_TOKENS)); assertEquals(11, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_TOKENS)); diff --git a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java index e0f88023e..b7becc88d 100644 --- a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java +++ b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java @@ -17,14 +17,13 @@ package org.apache.tika.langdetect.opennlp.metadatafilter; import org.apache.tika.config.Field; -import org.apache.tika.exception.TikaException; import org.apache.tika.langdetect.opennlp.OpenNLPDetector; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.metadata.filter.MetadataFilterBase; -public class OpenNLPMetadataFilter extends MetadataFilter { +public class OpenNLPMetadataFilter extends MetadataFilterBase { private int maxCharsForDetection = 10000; @@ -34,7 +33,7 @@ public class OpenNLPMetadataFilter extends MetadataFilter { } @Override - public void filter(Metadata metadata) throws TikaException { + public void filter(Metadata metadata) { OpenNLPDetector detector = new OpenNLPDetector(); detector.setMaxLength(maxCharsForDetection); String content = metadata.get(TikaCoreProperties.TIKA_CONTENT); diff --git a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java index a0e3dd6c7..cb850d2de 100644 --- a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java +++ b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java @@ -17,14 +17,13 @@ package org.apache.tika.langdetect.optimaize.metadatafilter; import org.apache.tika.config.Field; -import org.apache.tika.exception.TikaException; import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.metadata.filter.MetadataFilterBase; -public class OptimaizeMetadataFilter extends MetadataFilter { +public class OptimaizeMetadataFilter extends MetadataFilterBase { private int maxCharsForDetection = OptimaizeLangDetector.DEFAULT_MAX_CHARS_FOR_DETECTION; @@ -34,7 +33,7 @@ public class OptimaizeMetadataFilter extends MetadataFilter { } @Override - public void filter(Metadata metadata) throws TikaException { + public void filter(Metadata metadata) { OptimaizeLangDetector detector = new OptimaizeLangDetector(maxCharsForDetection); detector.loadModels(); String content = metadata.get(TikaCoreProperties.TIKA_CONTENT); diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java index af7150354..5fa033929 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java @@ -18,11 +18,11 @@ package org.apache.tika.pipes.core; import java.io.Serializable; -import org.apache.tika.metadata.listfilter.MetadataListFilter; +import org.apache.tika.metadata.filter.MetadataFilter; /** * Filter/Select some of the emitted output and pass it back to the client parser. */ -public abstract class PassbackFilter extends MetadataListFilter implements Serializable { +public abstract class PassbackFilter extends MetadataFilter implements Serializable { } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java index 40dab9cfe..1d0e83b9b 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PipesServer.java @@ -60,8 +60,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.filter.MetadataFilter; -import org.apache.tika.metadata.listfilter.MetadataListFilter; -import org.apache.tika.metadata.listfilter.NoOpListFilter; +import org.apache.tika.metadata.filter.NoOpFilter; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DigestingParser; @@ -482,8 +481,7 @@ public class PipesServer implements Runnable { long start = System.currentTimeMillis(); String stack = getContainerStacktrace(t, parseData.getMetadataList()); //we need to apply the metadata filter after we pull out the stacktrace - filterMetadata(t, parseData.getMetadataList()); - filterMetadataList(t, parseData); + filterMetadata(t, parseData); ParseContext parseContext = t.getParseContext(); FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = t.getOnParseException(); EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); @@ -527,26 +525,12 @@ public class PipesServer implements Runnable { return false; } - private void filterMetadata(FetchEmitTuple t, List<Metadata> metadataList) { + private void filterMetadata(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) { MetadataFilter filter = t.getParseContext().get(MetadataFilter.class); if (filter == null) { filter = tikaConfig.getMetadataFilter(); } - for (Metadata m : metadataList) { - try { - filter.filter(m); - } catch (TikaException e) { - LOG.warn("failed to filter metadata", e); - } - } - } - - private void filterMetadataList(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) { - MetadataListFilter filter = t.getParseContext().get(MetadataListFilter.class); - if (filter == null) { - filter = tikaConfig.getMetadataListFilter(); - } - if (filter instanceof NoOpListFilter) { + if (filter instanceof NoOpFilter) { return; } try { @@ -956,7 +940,7 @@ public class PipesServer implements Runnable { return metadataList; } - public void filter(MetadataListFilter filter) throws TikaException { + public void filter(MetadataFilter filter) throws TikaException { metadataList = filter.filter(metadataList); } diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java index 04c466c1d..9cc33478f 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java @@ -29,12 +29,10 @@ import org.junit.jupiter.api.io.TempDir; import org.apache.tika.config.TikaTaskTimeout; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.filter.AttachmentCountingListFilter; import org.apache.tika.metadata.filter.CompositeMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.MockUpperCaseFilter; -import org.apache.tika.metadata.listfilter.AttachmentCountingListFilter; -import org.apache.tika.metadata.listfilter.CompositeMetadataListFilter; -import org.apache.tika.metadata.listfilter.MetadataListFilter; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; import org.apache.tika.pipes.api.PipesResult; @@ -88,8 +86,8 @@ public class PipesClientTest { @Test public void testMetadataListFilter(@TempDir Path tmp) throws Exception { ParseContext parseContext = new ParseContext(); - MetadataListFilter metadataFilter = new CompositeMetadataListFilter(List.of(new AttachmentCountingListFilter())); - parseContext.set(MetadataListFilter.class, metadataFilter); + MetadataFilter metadataFilter = new CompositeMetadataFilter(List.of(new AttachmentCountingListFilter())); + parseContext.set(MetadataFilter.class, metadataFilter); String testFile = "mock-embedded.xml"; @@ -111,8 +109,8 @@ public class PipesClientTest { //I did both manually during development, but unit tests are better. :D ParseContext parseContext = new ParseContext(); parseContext.set(TikaTaskTimeout.class, new TikaTaskTimeout(1000)); - MetadataListFilter metadataFilter = new CompositeMetadataListFilter(List.of(new AttachmentCountingListFilter())); - parseContext.set(MetadataListFilter.class, metadataFilter); + MetadataFilter metadataFilter = new CompositeMetadataFilter(List.of(new AttachmentCountingListFilter())); + parseContext.set(MetadataFilter.class, metadataFilter); String testFile = "mock-timeout-10s.xml"; PipesClient pipesClient = init(tmp, testFile); diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java index a0fba59e9..d3090a36f 100644 --- a/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java @@ -19,6 +19,7 @@ package org.apache.tika.pipes.emitter.solr; import java.util.Collections; import java.util.Date; import java.util.HashMap; +import java.util.List; import java.util.Map; import com.fasterxml.jackson.databind.ObjectMapper; @@ -66,7 +67,7 @@ public class SolrEmitterDevTest { mappings.put(TikaCoreProperties.CREATED.getName(), "created"); mappings.put(TikaCoreProperties.TIKA_CONTENT.getName(), "content"); filter.setMappings(mappings); - filter.filter(metadata); + metadata = filter.filter(List.of(metadata)).get(0); solrEmitter.emit(emitKey, Collections.singletonList(metadata), new ParseContext()); } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 483596199..867481d45 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -76,7 +76,7 @@ public class TikaLoader { private Parser parsers; private Detector detectors; private EncodingDetector encodingDetectors; - private MetadataFilter metadataFilters; + private MetadataFilter metadataFilter; private Renderer renderers; private TikaLoader(TikaJsonConfig config, ClassLoader classLoader, @@ -193,14 +193,14 @@ public class TikaLoader { * @throws TikaConfigException if loading fails */ public synchronized MetadataFilter loadMetadataFilters() throws TikaConfigException { - if (metadataFilters == null) { + if (metadataFilter == null) { CompositeComponentLoader<MetadataFilter> loader = new CompositeComponentLoader<>( MetadataFilter.class, "metadataFilters", "metadata-filters", classLoader, objectMapper); List<MetadataFilter> filterList = loader.loadFromArray(config); - metadataFilters = new CompositeMetadataFilter(filterList); + metadataFilter = new CompositeMetadataFilter(filterList); } - return metadataFilters; + return metadataFilter; } /** diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index e0ba6279a..f620f7694 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -41,7 +41,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.listfilter.MetadataListFilter; +import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; @@ -87,8 +87,10 @@ public class RecursiveMetadataResource { //we shouldn't get here? LOG.error("something went seriously wrong", e); } - MetadataListFilter metadataListFilter = context.get(MetadataListFilter.class, getConfig().getMetadataListFilter()); - return metadataListFilter.filter(handler.getMetadataList()); + MetadataFilter metadataFilter = context.get(MetadataFilter.class, getConfig().getMetadataFilter()); + //note that the filter may modify the contents of handler's metadata list. + //do a deep copy if that's problematic. + return metadataFilter.filter(handler.getMetadataList()); } static HandlerConfig buildHandlerConfig(MultivaluedMap<String, String> httpHeaders, String handlerTypeName, HandlerConfig.PARSE_MODE parseMode) { diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 057c797b2..fcbb24c87 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -551,11 +551,14 @@ public class TikaResource { throws IOException, TikaException { Metadata metadata = new Metadata(); parseToMetadata(getInputStream(att.getObject(InputStream.class), metadata, httpHeaders, info), metadata, preparePostHeaderMap(att, httpHeaders), info, handlerTypeName); - TikaResource + List<Metadata> ret = TikaResource .getConfig() .getMetadataFilter() - .filter(metadata); - return metadata; + .filter(List.of(metadata)); + if (ret == null || ret.isEmpty()) { + return new Metadata(); + } + return ret.get(0); } @PUT @@ -566,10 +569,13 @@ public class TikaResource { throws IOException, TikaException { Metadata metadata = new Metadata(); parseToMetadata(getInputStream(is, metadata, httpHeaders, info), metadata, httpHeaders.getRequestHeaders(), info, handlerTypeName); - TikaResource + List<Metadata> ret = TikaResource .getConfig() .getMetadataFilter() - .filter(metadata); + .filter(List.of(metadata)); + if (ret == null || ret.isEmpty()) { + return new Metadata(); + } return metadata; }
