This is an automated email from the ASF dual-hosted git repository.

kwin pushed a commit to branch feature/tika-3.2.3
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git

commit c2db2fc9efae869282a9272f4a3fb69623f96f8f
Author: Konrad Windszus <[email protected]>
AuthorDate: Tue Dec 16 21:21:57 2025 +0100

    OAK-9752 Migrate to Tika 3.2.3
---
 oak-lucene/pom.xml                                 |  2 +-
 oak-parent/pom.xml                                 |  2 +-
 .../index/search/spi/binary/TikaParserConfig.java  | 74 ++++++++--------------
 3 files changed, 30 insertions(+), 48 deletions(-)

diff --git a/oak-lucene/pom.xml b/oak-lucene/pom.xml
index b5b411cdd1..b3adf7489d 100644
--- a/oak-lucene/pom.xml
+++ b/oak-lucene/pom.xml
@@ -389,7 +389,7 @@
     </dependency>
     <dependency>
       <groupId>org.apache.tika</groupId>
-      <artifactId>tika-parsers</artifactId>
+      <artifactId>tika-parsers-standard-package</artifactId>
       <version>${tika.version}</version>
       <scope>test</scope>
       <exclusions>
diff --git a/oak-parent/pom.xml b/oak-parent/pom.xml
index e8bab71210..861a469025 100644
--- a/oak-parent/pom.xml
+++ b/oak-parent/pom.xml
@@ -62,7 +62,7 @@
     <slf4j.version>1.7.36</slf4j.version> <!-- sync with logback version -->
     <logback.version>1.2.13</logback.version>
     <h2.version>2.1.214</h2.version>
-    <tika.version>1.28.5</tika.version>
+    <tika.version>3.2.3</tika.version>
     <derby.version>10.15.2.0</derby.version>
     <jackson.version>2.17.3</jackson.version>
     <testcontainers.version>1.21.1</testcontainers.version>
diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java
index 447cc0a582..a70ae94823 100644
--- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java
+++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java
@@ -22,22 +22,22 @@ package org.apache.jackrabbit.oak.plugins.index.search.spi.binary;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashSet;
+import java.util.Optional;
 import java.util.Set;
 
-import javax.xml.parsers.DocumentBuilder;
-
-import org.apache.jackrabbit.oak.commons.StringUtils;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 
 public class TikaParserConfig {
 
-    private static final String EMPTY_PARSER = "org.apache.tika.parser.EmptyParser";
+    private static final Logger log = LoggerFactory.getLogger(TikaParserConfig.class);
 
     /**
      * Determines the set of MediaType which have been configured with an EmptyParser.
@@ -48,50 +48,32 @@ public class TikaParserConfig {
     public static Set<MediaType> getNonIndexedMediaTypes(InputStream configStream) throws
             TikaException, IOException, SAXException {
         Set<MediaType> result = new HashSet<>();
-        Element element = getBuilder().parse(configStream).getDocumentElement();
-        NodeList nodes = element.getElementsByTagName("parsers");
-        if (nodes.getLength() == 1) {
-            Node parentNode = nodes.item(0);
-            NodeList parsersNodes = parentNode.getChildNodes();
-            for (int i = 0; i < parsersNodes.getLength(); i++) {
-                Node node = parsersNodes.item(i);
-                if (node instanceof Element) {
-                    String className = ((Element) node).getAttribute("class");
-                    if (EMPTY_PARSER.equals(className)) {
-                        NodeList mimes = ((Element) node).getElementsByTagName("mime");
-                        parseMimeTypes(result, mimes);
-                    }
-                }
+        TikaConfig config = new TikaConfig(configStream);
+        if (config.getParser() instanceof org.apache.tika.parser.CompositeParser) {
+            // pick the (decorated) empty parser
+            Optional<Parser> emptyParser = ((org.apache.tika.parser.CompositeParser) config.getParser()).getAllComponentParsers().stream()
+                    .filter(p -> isEmptyParser(p))
+                    .findFirst();
+            if (emptyParser.isPresent()) {
+                emptyParser.get().getSupportedTypes(new ParseContext()).forEach(result::add);
             }
+        } else {
+            log.debug("Tika CompositeParser not used, no empty parsers configured via custom tika config");
         }
         return result;
     }
 
-
-    private static void parseMimeTypes(Set<MediaType> result, NodeList mimes) {
-        /*
-        <parser class="org.apache.tika.parser.EmptyParser">
-            <mime>application/x-archive</mime>
-            <mime>application/x-bzip</mime>
-            <mime>application/x-bzip2</mime>
-        </parser>
-        */
-        for (int j = 0; j < mimes.getLength(); j++) {
-            Node mime = mimes.item(j);
-            if (mime instanceof Element) {
-                String mimeValue = mime.getTextContent();
-                mimeValue = StringUtils.emptyToNull(mimeValue);
-                if (mimeValue != null) {
-                    MediaType mediaType = MediaType.parse(mimeValue.trim());
-                    if (mediaType != null) {
-                        result.add(mediaType);
-                    }
-                }
-            }
+    /**
+     * Returns true if the given parser is an EmptyParser or decorates an EmptyParser.
+     * @param parser
+     * @return {@code true} if the given parser is an EmptyParser or decorates an EmptyParser
+     */
+    private static boolean isEmptyParser(Parser parser) {
+        if (parser instanceof org.apache.tika.parser.EmptyParser) {
+            return true;
+        } else if (parser instanceof org.apache.tika.parser.ParserDecorator) {
+            return isEmptyParser(((ParserDecorator) parser).getWrappedParser());
         }
-    }
-
-    private static DocumentBuilder getBuilder() throws TikaException {
-        return new ParseContext().getDocumentBuilder();
+        return false;
     }
 }

Reply via email to