This is an automated email from the ASF dual-hosted git repository. nick pushed a commit to branch multiple-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit ee60f5e8ac4002cb6a296adc24cbcb7183cb1f8e Author: Nick Burch <n...@gagravarr.org> AuthorDate: Tue Mar 13 17:43:30 2018 +0000 Implement some metadata policies for merging values from multiple parsers --- .../parser/multiple/AbstractMultipleParser.java | 48 ++++++++++++++++++---- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java index 6262dc1..9781f49 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java @@ -16,9 +16,12 @@ */ package org.apache.tika.parser.multiple; +import static org.apache.tika.utils.ParserUtils.cloneMetadata; +import static org.apache.tika.utils.ParserUtils.recordParserDetails; +import static org.apache.tika.utils.ParserUtils.recordParserFailure; + import java.io.IOException; import java.io.InputStream; -import java.nio.file.Path; import java.util.Arrays; import java.util.HashSet; import java.util.List; @@ -34,7 +37,6 @@ import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; -import static org.apache.tika.utils.ParserUtils.*; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -187,7 +189,7 @@ public abstract class AbstractMultipleParser extends AbstractParser { // later if required for parser 2+ // TODO Should we use RereadableInputStream instead? TikaInputStream taggedStream = TikaInputStream.get(stream, tmp); - Path path = taggedStream.getPath(); + taggedStream.getPath(); // TODO Somehow shield/wrap the Handler, so that we can // avoid failures if multiple parsers want to do content @@ -202,8 +204,9 @@ public abstract class AbstractMultipleParser extends AbstractParser { // Record that we used this parser recordParserDetails(p, metadata); - - // TODO Handle metadata clashes based on the Policy + + // Prepare an near-empty Metadata, will merge after + metadata = cloneMetadata(originalMetadata); // Process if possible Exception failure = null; @@ -229,14 +232,45 @@ public abstract class AbstractMultipleParser extends AbstractParser { break; } - // TODO Handle metadata clashes based on the Policy - lastMetadata = cloneMetadata(metadata); + // Handle metadata merging / clashes + metadata = mergeMetadata(metadata, lastMetadata, policy); // Prepare for the next parser, if present + lastMetadata = cloneMetadata(metadata); taggedStream.reset(); } } finally { tmp.dispose(); } } + + // TODO Provide a method that takes an InputStreamSource as well, + // and a ContentHandlerFactory. Will need wrappers to convert standard + + protected static Metadata mergeMetadata(Metadata newMetadata, Metadata lastMetadata, MetadataPolicy policy) { + if (policy == MetadataPolicy.DISCARD_ALL) { + return newMetadata; + } + + for (String n : lastMetadata.names()) { + if (newMetadata.get(n) == null) { + newMetadata.set(n, lastMetadata.get(n)); + } else { + switch (policy) { + case FIRST_WINS: + // Use the earlier value + newMetadata.set(n, lastMetadata.get(n)); + continue; + case LAST_WINS: + // Most recent (last) parser has already won + continue; + case KEEP_ALL: + // TODO Find unique values to add + // TODO Implement + continue; + } + } + } + return newMetadata; + } } -- To stop receiving notification emails like this one, please contact n...@apache.org.