This is an automated email from the ASF dual-hosted git repository. kwin pushed a commit to branch feature/tika-3.2.3 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit c2db2fc9efae869282a9272f4a3fb69623f96f8f Author: Konrad Windszus <[email protected]> AuthorDate: Tue Dec 16 21:21:57 2025 +0100 OAK-9752 Migrate to Tika 3.2.3 --- oak-lucene/pom.xml | 2 +- oak-parent/pom.xml | 2 +- .../index/search/spi/binary/TikaParserConfig.java | 74 ++++++++-------------- 3 files changed, 30 insertions(+), 48 deletions(-) diff --git a/oak-lucene/pom.xml b/oak-lucene/pom.xml index b5b411cdd1..b3adf7489d 100644 --- a/oak-lucene/pom.xml +++ b/oak-lucene/pom.xml @@ -389,7 +389,7 @@ </dependency> <dependency> <groupId>org.apache.tika</groupId> - <artifactId>tika-parsers</artifactId> + <artifactId>tika-parsers-standard-package</artifactId> <version>${tika.version}</version> <scope>test</scope> <exclusions> diff --git a/oak-parent/pom.xml b/oak-parent/pom.xml index e8bab71210..861a469025 100644 --- a/oak-parent/pom.xml +++ b/oak-parent/pom.xml @@ -62,7 +62,7 @@ <slf4j.version>1.7.36</slf4j.version> <!-- sync with logback version --> <logback.version>1.2.13</logback.version> <h2.version>2.1.214</h2.version> - <tika.version>1.28.5</tika.version> + <tika.version>3.2.3</tika.version> <derby.version>10.15.2.0</derby.version> <jackson.version>2.17.3</jackson.version> <testcontainers.version>1.21.1</testcontainers.version> diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java index 447cc0a582..a70ae94823 100644 --- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java +++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java @@ -22,22 +22,22 @@ package org.apache.jackrabbit.oak.plugins.index.search.spi.binary; import java.io.IOException; import java.io.InputStream; import java.util.HashSet; +import java.util.Optional; import java.util.Set; -import javax.xml.parsers.DocumentBuilder; - -import org.apache.jackrabbit.oak.commons.StringUtils; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.w3c.dom.Element; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; public class TikaParserConfig { - private static final String EMPTY_PARSER = "org.apache.tika.parser.EmptyParser"; + private static final Logger log = LoggerFactory.getLogger(TikaParserConfig.class); /** * Determines the set of MediaType which have been configured with an EmptyParser. @@ -48,50 +48,32 @@ public class TikaParserConfig { public static Set<MediaType> getNonIndexedMediaTypes(InputStream configStream) throws TikaException, IOException, SAXException { Set<MediaType> result = new HashSet<>(); - Element element = getBuilder().parse(configStream).getDocumentElement(); - NodeList nodes = element.getElementsByTagName("parsers"); - if (nodes.getLength() == 1) { - Node parentNode = nodes.item(0); - NodeList parsersNodes = parentNode.getChildNodes(); - for (int i = 0; i < parsersNodes.getLength(); i++) { - Node node = parsersNodes.item(i); - if (node instanceof Element) { - String className = ((Element) node).getAttribute("class"); - if (EMPTY_PARSER.equals(className)) { - NodeList mimes = ((Element) node).getElementsByTagName("mime"); - parseMimeTypes(result, mimes); - } - } + TikaConfig config = new TikaConfig(configStream); + if (config.getParser() instanceof org.apache.tika.parser.CompositeParser) { + // pick the (decorated) empty parser + Optional<Parser> emptyParser = ((org.apache.tika.parser.CompositeParser) config.getParser()).getAllComponentParsers().stream() + .filter(p -> isEmptyParser(p)) + .findFirst(); + if (emptyParser.isPresent()) { + emptyParser.get().getSupportedTypes(new ParseContext()).forEach(result::add); } + } else { + log.debug("Tika CompositeParser not used, no empty parsers configured via custom tika config"); } return result; } - - private static void parseMimeTypes(Set<MediaType> result, NodeList mimes) { - /* - <parser class="org.apache.tika.parser.EmptyParser"> - <mime>application/x-archive</mime> - <mime>application/x-bzip</mime> - <mime>application/x-bzip2</mime> - </parser> - */ - for (int j = 0; j < mimes.getLength(); j++) { - Node mime = mimes.item(j); - if (mime instanceof Element) { - String mimeValue = mime.getTextContent(); - mimeValue = StringUtils.emptyToNull(mimeValue); - if (mimeValue != null) { - MediaType mediaType = MediaType.parse(mimeValue.trim()); - if (mediaType != null) { - result.add(mediaType); - } - } - } + /** + * Returns true if the given parser is an EmptyParser or decorates an EmptyParser. + * @param parser + * @return {@code true} if the given parser is an EmptyParser or decorates an EmptyParser + */ + private static boolean isEmptyParser(Parser parser) { + if (parser instanceof org.apache.tika.parser.EmptyParser) { + return true; + } else if (parser instanceof org.apache.tika.parser.ParserDecorator) { + return isEmptyParser(((ParserDecorator) parser).getWrappedParser()); } - } - - private static DocumentBuilder getBuilder() throws TikaException { - return new ParseContext().getDocumentBuilder(); + return false; } }
