This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4545=cleanup-mediaregistry
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d8b5296f29e92031969395833c4f310e74be010a Author: tallison <[email protected]> AuthorDate: Fri Nov 28 08:39:30 2025 -0500 TIKA-4545 - clean up media registry --- .../apache/tika/config/loader/ParserLoader.java | 41 +++++++++++++++------ .../org/apache/tika/config/loader/TikaLoader.java | 43 +++++++++------------- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java index 5cfc3cd11..1ff3fccd1 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java @@ -33,9 +33,10 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MediaTypeRegistry; +import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; @@ -52,7 +53,7 @@ public class ParserLoader { private final ClassLoader classLoader; private final ObjectMapper objectMapper; - private final MediaTypeRegistry mediaTypeRegistry; + private final EncodingDetector encodingDetector; /** * Holds parsed config data before decoration is applied. 
@@ -71,10 +72,10 @@ public class ParserLoader { } public ParserLoader(ClassLoader classLoader, ObjectMapper objectMapper, - MediaTypeRegistry mediaTypeRegistry) { + EncodingDetector encodingDetector) { this.classLoader = classLoader; this.objectMapper = objectMapper; - this.mediaTypeRegistry = mediaTypeRegistry; + this.encodingDetector = encodingDetector; } /** @@ -85,6 +86,8 @@ public class ParserLoader { * @throws TikaConfigException if loading fails */ public CompositeParser load(TikaJsonConfig config) throws TikaConfigException { + //TODO -- need to handle multiparsers at some point + //TODO -- add special handling for external parsers? List<Parser> parserList = new ArrayList<>(); // Load configured parsers @@ -185,7 +188,7 @@ public class ParserLoader { parserList.addAll(spiParsers); } - return new CompositeParser(mediaTypeRegistry, parserList); + return new CompositeParser(TikaLoader.getMediaTypeRegistry(), parserList); } private ParsedParserConfig loadConfiguredParser(String name, JsonNode configNode, @@ -213,16 +216,32 @@ public class ParserLoader { throws TikaConfigException { try { + Parser parser; + // Try constructor with String parameter (JSON config) try { + //TODO -- change this from String to JsonConfig or simple wrapper class Constructor<?> constructor = parserClass.getConstructor(String.class); - return (Parser) constructor.newInstance(configJson); + parser = (Parser) constructor.newInstance(configJson); } catch (NoSuchMethodException e) { - // TODO -- entrypoint for actual configuration - // Fall back to zero-arg constructor - return (Parser) ServiceLoaderUtils.newInstance(parserClass, - new org.apache.tika.config.ServiceLoader(classLoader)); + // Try constructor with EncodingDetector parameter (for AbstractEncodingDetectorParser) + if (AbstractEncodingDetectorParser.class.isAssignableFrom(parserClass)) { + try { + Constructor<?> constructor = parserClass.getConstructor(EncodingDetector.class); + parser = (Parser) 
constructor.newInstance(encodingDetector); + } catch (NoSuchMethodException ex) { + // Fall back to zero-arg constructor + parser = (Parser) ServiceLoaderUtils.newInstance(parserClass, + new org.apache.tika.config.ServiceLoader(classLoader)); + } + } else { + // Fall back to zero-arg constructor + parser = (Parser) ServiceLoaderUtils.newInstance(parserClass, + new org.apache.tika.config.ServiceLoader(classLoader)); + } } + + return parser; } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) { throw new TikaConfigException("Failed to instantiate parser: " + parserClass.getName(), e); @@ -268,7 +287,7 @@ public class ParserLoader { fallbackParsers.add(fallbackConfig.parser); } - return new FallbackParser(mediaTypeRegistry, MetadataPolicy.KEEP_ALL, fallbackParsers); + return new FallbackParser(TikaLoader.getMediaTypeRegistry(), MetadataPolicy.KEEP_ALL, fallbackParsers); } private List<Parser> loadSpiParsers(Set<Class<?>> excludeClasses) { diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 2432ed28a..aeeaf216f 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -74,9 +74,9 @@ public class TikaLoader { private final TikaJsonConfig config; private final ClassLoader classLoader; private final ObjectMapper objectMapper; - private final MediaTypeRegistry mediaTypeRegistry; // Cached instances (lazy loaded) + private static MediaTypeRegistry mediaTypeRegistry; private Parser parsers; private Detector detectors; private EncodingDetector encodingDetectors; @@ -84,12 +84,10 @@ public class TikaLoader { private Renderer renderers; private ConfigLoader configLoader; - private TikaLoader(TikaJsonConfig config, ClassLoader classLoader, - MediaTypeRegistry mediaTypeRegistry) { + private 
TikaLoader(TikaJsonConfig config, ClassLoader classLoader) { this.config = config; this.classLoader = classLoader; this.objectMapper = TikaJsonConfig.getObjectMapper(); - this.mediaTypeRegistry = mediaTypeRegistry; } /** @@ -114,36 +112,26 @@ public class TikaLoader { public static TikaLoader load(Path configPath, ClassLoader classLoader) throws TikaConfigException { TikaJsonConfig config = TikaJsonConfig.load(configPath); - MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry(); - return new TikaLoader(config, classLoader, registry); - } - - /** - * Loads a Tika configuration with custom media type registry. - * - * @param configPath the path to the JSON configuration file - * @param classLoader the class loader to use for loading components - * @param mediaTypeRegistry the media type registry to use - * @return the Tika loader - * @throws TikaConfigException if loading or parsing fails - */ - public static TikaLoader load(Path configPath, ClassLoader classLoader, - MediaTypeRegistry mediaTypeRegistry) - throws TikaConfigException { - TikaJsonConfig config = TikaJsonConfig.load(configPath); - return new TikaLoader(config, classLoader, mediaTypeRegistry); + return new TikaLoader(config, classLoader); } /** * Loads and returns all parsers. * Results are cached - subsequent calls return the same instance. + * <p> + * Note: This method ensures EncodingDetectors are loaded first, + * as some parsers require them during construction (e.g., AbstractEncodingDetectorParser + * requires an EncodingDetector). 
* * @return the parser (typically a CompositeParser internally) * @throws TikaConfigException if loading fails */ public synchronized Parser loadParsers() throws TikaConfigException { if (parsers == null) { - ParserLoader loader = new ParserLoader(classLoader, objectMapper, mediaTypeRegistry); + // Load EncodingDetectors first - some parsers need them during construction + EncodingDetector encodingDetector = loadEncodingDetectors(); + + ParserLoader loader = new ParserLoader(classLoader, objectMapper, encodingDetector); parsers = loader.load(config); } return parsers; @@ -163,7 +151,7 @@ public class TikaLoader { CompositeComponentLoader<Detector> loader = new CompositeComponentLoader<>( Detector.class, "detectors", "detectors", classLoader, objectMapper); List<Detector> detectorList = loader.loadFromArray(config); - detectors = new CompositeDetector(mediaTypeRegistry, detectorList); + detectors = new CompositeDetector(getMediaTypeRegistry(), detectorList); } return detectors; } @@ -277,10 +265,15 @@ public class TikaLoader { /** * Gets the media type registry. + * Lazily loads the default registry if not already set. + * This is a static singleton shared across all TikaLoader instances. * * @return the media type registry */ - public MediaTypeRegistry getMediaTypeRegistry() { + public static synchronized MediaTypeRegistry getMediaTypeRegistry() { + if (mediaTypeRegistry == null) { + mediaTypeRegistry = MediaTypeRegistry.getDefaultRegistry(); + } return mediaTypeRegistry; } }
