This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new e7161b7ca TIKA-4545 - clean up media registry (#2411)
e7161b7ca is described below

commit e7161b7cacd36e045b36e138c67f1da0c3fedeec
Author: Tim Allison <[email protected]>
AuthorDate: Fri Nov 28 09:51:18 2025 -0500

    TIKA-4545 - clean up media registry (#2411)
---
 .../apache/tika/config/loader/ParserLoader.java    | 41 +++++++++++++++------
 .../org/apache/tika/config/loader/TikaLoader.java  | 43 +++++++++-------------
 2 files changed, 48 insertions(+), 36 deletions(-)

diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
index 5cfc3cd11..1ff3fccd1 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
@@ -33,9 +33,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.AbstractEncodingDetectorParser;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
@@ -52,7 +53,7 @@ public class ParserLoader {
 
     private final ClassLoader classLoader;
     private final ObjectMapper objectMapper;
-    private final MediaTypeRegistry mediaTypeRegistry;
+    private final EncodingDetector encodingDetector;
 
     /**
      * Holds parsed config data before decoration is applied.
@@ -71,10 +72,10 @@ public class ParserLoader {
     }
 
     public ParserLoader(ClassLoader classLoader, ObjectMapper objectMapper,
-                        MediaTypeRegistry mediaTypeRegistry) {
+                        EncodingDetector encodingDetector) {
         this.classLoader = classLoader;
         this.objectMapper = objectMapper;
-        this.mediaTypeRegistry = mediaTypeRegistry;
+        this.encodingDetector = encodingDetector;
     }
 
     /**
@@ -85,6 +86,8 @@ public class ParserLoader {
      * @throws TikaConfigException if loading fails
      */
     public CompositeParser load(TikaJsonConfig config) throws TikaConfigException {
+        //TODO -- need to handle multiparsers at some point
+        //TODO -- add special handling for external parsers?
         List<Parser> parserList = new ArrayList<>();
 
         // Load configured parsers
@@ -185,7 +188,7 @@ public class ParserLoader {
             parserList.addAll(spiParsers);
         }
 
-        return new CompositeParser(mediaTypeRegistry, parserList);
+        return new CompositeParser(TikaLoader.getMediaTypeRegistry(), parserList);
     }
 
     private ParsedParserConfig loadConfiguredParser(String name, JsonNode configNode,
@@ -213,16 +216,32 @@ public class ParserLoader {
             throws TikaConfigException {
 
         try {
+            Parser parser;
+
             // Try constructor with String parameter (JSON config)
             try {
+                //TODO -- change this from String to JsonConfig or simple wrapper class
                 Constructor<?> constructor = parserClass.getConstructor(String.class);
-                return (Parser) constructor.newInstance(configJson);
+                parser = (Parser) constructor.newInstance(configJson);
             } catch (NoSuchMethodException e) {
-                // TODO -- entrypoint for actual configuration
-                // Fall back to zero-arg constructor
-                return (Parser) ServiceLoaderUtils.newInstance(parserClass,
-                        new org.apache.tika.config.ServiceLoader(classLoader));
+                // Try constructor with EncodingDetector parameter (for AbstractEncodingDetectorParser)
+                if (AbstractEncodingDetectorParser.class.isAssignableFrom(parserClass)) {
+                    try {
+                        Constructor<?> constructor = parserClass.getConstructor(EncodingDetector.class);
+                        parser = (Parser) constructor.newInstance(encodingDetector);
+                    } catch (NoSuchMethodException ex) {
+                        // Fall back to zero-arg constructor
+                        parser = (Parser) ServiceLoaderUtils.newInstance(parserClass,
+                                new org.apache.tika.config.ServiceLoader(classLoader));
+                    }
+                } else {
+                    // Fall back to zero-arg constructor
+                    parser = (Parser) ServiceLoaderUtils.newInstance(parserClass,
+                            new org.apache.tika.config.ServiceLoader(classLoader));
+                }
             }
+
+            return parser;
         } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
             throw new TikaConfigException("Failed to instantiate parser: " +
                     parserClass.getName(), e);
@@ -268,7 +287,7 @@ public class ParserLoader {
             fallbackParsers.add(fallbackConfig.parser);
         }
 
-        return new FallbackParser(mediaTypeRegistry, MetadataPolicy.KEEP_ALL, fallbackParsers);
+        return new FallbackParser(TikaLoader.getMediaTypeRegistry(), MetadataPolicy.KEEP_ALL, fallbackParsers);
     }
 
     private List<Parser> loadSpiParsers(Set<Class<?>> excludeClasses) {
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 2432ed28a..aeeaf216f 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -74,9 +74,9 @@ public class TikaLoader {
     private final TikaJsonConfig config;
     private final ClassLoader classLoader;
     private final ObjectMapper objectMapper;
-    private final MediaTypeRegistry mediaTypeRegistry;
 
     // Cached instances (lazy loaded)
+    private static MediaTypeRegistry mediaTypeRegistry;
     private Parser parsers;
     private Detector detectors;
     private EncodingDetector encodingDetectors;
@@ -84,12 +84,10 @@ public class TikaLoader {
     private Renderer renderers;
     private ConfigLoader configLoader;
 
-    private TikaLoader(TikaJsonConfig config, ClassLoader classLoader,
-                       MediaTypeRegistry mediaTypeRegistry) {
+    private TikaLoader(TikaJsonConfig config, ClassLoader classLoader) {
         this.config = config;
         this.classLoader = classLoader;
         this.objectMapper = TikaJsonConfig.getObjectMapper();
-        this.mediaTypeRegistry = mediaTypeRegistry;
     }
 
     /**
@@ -114,36 +112,26 @@ public class TikaLoader {
     public static TikaLoader load(Path configPath, ClassLoader classLoader)
             throws TikaConfigException {
         TikaJsonConfig config = TikaJsonConfig.load(configPath);
-        MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry();
-        return new TikaLoader(config, classLoader, registry);
-    }
-
-    /**
-     * Loads a Tika configuration with custom media type registry.
-     *
-     * @param configPath the path to the JSON configuration file
-     * @param classLoader the class loader to use for loading components
-     * @param mediaTypeRegistry the media type registry to use
-     * @return the Tika loader
-     * @throws TikaConfigException if loading or parsing fails
-     */
-    public static TikaLoader load(Path configPath, ClassLoader classLoader,
-                                   MediaTypeRegistry mediaTypeRegistry)
-            throws TikaConfigException {
-        TikaJsonConfig config = TikaJsonConfig.load(configPath);
-        return new TikaLoader(config, classLoader, mediaTypeRegistry);
+        return new TikaLoader(config, classLoader);
     }
 
     /**
      * Loads and returns all parsers.
      * Results are cached - subsequent calls return the same instance.
+     * <p>
+     * Note: This method ensures EncodingDetectors are loaded first,
+     * as some parsers require them during construction (e.g., AbstractEncodingDetectorParser
+     * requires an EncodingDetector).
      *
      * @return the parser (typically a CompositeParser internally)
      * @throws TikaConfigException if loading fails
      */
     public synchronized Parser loadParsers() throws TikaConfigException {
         if (parsers == null) {
-            ParserLoader loader = new ParserLoader(classLoader, objectMapper, mediaTypeRegistry);
+            // Load EncodingDetectors first - some parsers need them during construction
+            EncodingDetector encodingDetector = loadEncodingDetectors();
+
+            ParserLoader loader = new ParserLoader(classLoader, objectMapper, encodingDetector);
             parsers = loader.load(config);
         }
         return parsers;
@@ -163,7 +151,7 @@ public class TikaLoader {
             CompositeComponentLoader<Detector> loader = new CompositeComponentLoader<>(
                     Detector.class, "detectors", "detectors", classLoader, objectMapper);
             List<Detector> detectorList = loader.loadFromArray(config);
-            detectors = new CompositeDetector(mediaTypeRegistry, detectorList);
+            detectors = new CompositeDetector(getMediaTypeRegistry(), detectorList);
         }
         return detectors;
     }
@@ -277,10 +265,15 @@ public class TikaLoader {
 
     /**
      * Gets the media type registry.
+     * Lazily loads the default registry if not already set.
+     * This is a static singleton shared across all TikaLoader instances.
      *
      * @return the media type registry
      */
-    public MediaTypeRegistry getMediaTypeRegistry() {
+    public static synchronized MediaTypeRegistry getMediaTypeRegistry() {
+        if (mediaTypeRegistry == null) {
+            mediaTypeRegistry = MediaTypeRegistry.getDefaultRegistry();
+        }
         return mediaTypeRegistry;
     }
 }

Reply via email to