This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e7161b7ca TIKA-4545 - clean up media registry (#2411)
e7161b7ca is described below
commit e7161b7cacd36e045b36e138c67f1da0c3fedeec
Author: Tim Allison <[email protected]>
AuthorDate: Fri Nov 28 09:51:18 2025 -0500
TIKA-4545 - clean up media registry (#2411)
---
.../apache/tika/config/loader/ParserLoader.java | 41 +++++++++++++++------
.../org/apache/tika/config/loader/TikaLoader.java | 43 +++++++++-------------
2 files changed, 48 insertions(+), 36 deletions(-)
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
index 5cfc3cd11..1ff3fccd1 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
@@ -33,9 +33,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
@@ -52,7 +53,7 @@ public class ParserLoader {
private final ClassLoader classLoader;
private final ObjectMapper objectMapper;
- private final MediaTypeRegistry mediaTypeRegistry;
+ private final EncodingDetector encodingDetector;
/**
* Holds parsed config data before decoration is applied.
@@ -71,10 +72,10 @@ public class ParserLoader {
}
public ParserLoader(ClassLoader classLoader, ObjectMapper objectMapper,
- MediaTypeRegistry mediaTypeRegistry) {
+ EncodingDetector encodingDetector) {
this.classLoader = classLoader;
this.objectMapper = objectMapper;
- this.mediaTypeRegistry = mediaTypeRegistry;
+ this.encodingDetector = encodingDetector;
}
/**
@@ -85,6 +86,8 @@ public class ParserLoader {
* @throws TikaConfigException if loading fails
*/
public CompositeParser load(TikaJsonConfig config) throws
TikaConfigException {
+ //TODO -- need to handle multiparsers at some point
+ //TODO -- add special handling for external parsers?
List<Parser> parserList = new ArrayList<>();
// Load configured parsers
@@ -185,7 +188,7 @@ public class ParserLoader {
parserList.addAll(spiParsers);
}
- return new CompositeParser(mediaTypeRegistry, parserList);
+ return new CompositeParser(TikaLoader.getMediaTypeRegistry(),
parserList);
}
private ParsedParserConfig loadConfiguredParser(String name, JsonNode
configNode,
@@ -213,16 +216,32 @@ public class ParserLoader {
throws TikaConfigException {
try {
+ Parser parser;
+
// Try constructor with String parameter (JSON config)
try {
+ //TODO -- change this from String to JsonConfig or simple
wrapper class
Constructor<?> constructor =
parserClass.getConstructor(String.class);
- return (Parser) constructor.newInstance(configJson);
+ parser = (Parser) constructor.newInstance(configJson);
} catch (NoSuchMethodException e) {
- // TODO -- entrypoint for actual configuration
- // Fall back to zero-arg constructor
- return (Parser) ServiceLoaderUtils.newInstance(parserClass,
- new org.apache.tika.config.ServiceLoader(classLoader));
+ // Try constructor with EncodingDetector parameter (for
AbstractEncodingDetectorParser)
+ if
(AbstractEncodingDetectorParser.class.isAssignableFrom(parserClass)) {
+ try {
+ Constructor<?> constructor =
parserClass.getConstructor(EncodingDetector.class);
+ parser = (Parser)
constructor.newInstance(encodingDetector);
+ } catch (NoSuchMethodException ex) {
+ // Fall back to zero-arg constructor
+ parser = (Parser)
ServiceLoaderUtils.newInstance(parserClass,
+ new
org.apache.tika.config.ServiceLoader(classLoader));
+ }
+ } else {
+ // Fall back to zero-arg constructor
+ parser = (Parser)
ServiceLoaderUtils.newInstance(parserClass,
+ new
org.apache.tika.config.ServiceLoader(classLoader));
+ }
}
+
+ return parser;
} catch (InstantiationException | IllegalAccessException |
InvocationTargetException e) {
throw new TikaConfigException("Failed to instantiate parser: " +
parserClass.getName(), e);
@@ -268,7 +287,7 @@ public class ParserLoader {
fallbackParsers.add(fallbackConfig.parser);
}
- return new FallbackParser(mediaTypeRegistry, MetadataPolicy.KEEP_ALL,
fallbackParsers);
+ return new FallbackParser(TikaLoader.getMediaTypeRegistry(),
MetadataPolicy.KEEP_ALL, fallbackParsers);
}
private List<Parser> loadSpiParsers(Set<Class<?>> excludeClasses) {
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 2432ed28a..aeeaf216f 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -74,9 +74,9 @@ public class TikaLoader {
private final TikaJsonConfig config;
private final ClassLoader classLoader;
private final ObjectMapper objectMapper;
- private final MediaTypeRegistry mediaTypeRegistry;
// Cached instances (lazy loaded)
+ private static MediaTypeRegistry mediaTypeRegistry;
private Parser parsers;
private Detector detectors;
private EncodingDetector encodingDetectors;
@@ -84,12 +84,10 @@ public class TikaLoader {
private Renderer renderers;
private ConfigLoader configLoader;
- private TikaLoader(TikaJsonConfig config, ClassLoader classLoader,
- MediaTypeRegistry mediaTypeRegistry) {
+ private TikaLoader(TikaJsonConfig config, ClassLoader classLoader) {
this.config = config;
this.classLoader = classLoader;
this.objectMapper = TikaJsonConfig.getObjectMapper();
- this.mediaTypeRegistry = mediaTypeRegistry;
}
/**
@@ -114,36 +112,26 @@ public class TikaLoader {
public static TikaLoader load(Path configPath, ClassLoader classLoader)
throws TikaConfigException {
TikaJsonConfig config = TikaJsonConfig.load(configPath);
- MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry();
- return new TikaLoader(config, classLoader, registry);
- }
-
- /**
- * Loads a Tika configuration with custom media type registry.
- *
- * @param configPath the path to the JSON configuration file
- * @param classLoader the class loader to use for loading components
- * @param mediaTypeRegistry the media type registry to use
- * @return the Tika loader
- * @throws TikaConfigException if loading or parsing fails
- */
- public static TikaLoader load(Path configPath, ClassLoader classLoader,
- MediaTypeRegistry mediaTypeRegistry)
- throws TikaConfigException {
- TikaJsonConfig config = TikaJsonConfig.load(configPath);
- return new TikaLoader(config, classLoader, mediaTypeRegistry);
+ return new TikaLoader(config, classLoader);
}
/**
* Loads and returns all parsers.
* Results are cached - subsequent calls return the same instance.
+ * <p>
+ * Note: This method ensures EncodingDetectors are loaded first,
+ * as some parsers require them during construction (e.g.,
AbstractEncodingDetectorParser
+ * requires an EncodingDetector).
*
* @return the parser (typically a CompositeParser internally)
* @throws TikaConfigException if loading fails
*/
public synchronized Parser loadParsers() throws TikaConfigException {
if (parsers == null) {
- ParserLoader loader = new ParserLoader(classLoader, objectMapper,
mediaTypeRegistry);
+ // Load EncodingDetectors first - some parsers need them during
construction
+ EncodingDetector encodingDetector = loadEncodingDetectors();
+
+ ParserLoader loader = new ParserLoader(classLoader, objectMapper,
encodingDetector);
parsers = loader.load(config);
}
return parsers;
@@ -163,7 +151,7 @@ public class TikaLoader {
CompositeComponentLoader<Detector> loader = new
CompositeComponentLoader<>(
Detector.class, "detectors", "detectors", classLoader,
objectMapper);
List<Detector> detectorList = loader.loadFromArray(config);
- detectors = new CompositeDetector(mediaTypeRegistry, detectorList);
+ detectors = new CompositeDetector(getMediaTypeRegistry(),
detectorList);
}
return detectors;
}
@@ -277,10 +265,15 @@ public class TikaLoader {
/**
* Gets the media type registry.
+ * Lazily loads the default registry if not already set.
+ * This is a static singleton shared across all TikaLoader instances.
*
* @return the media type registry
*/
- public MediaTypeRegistry getMediaTypeRegistry() {
+ public static synchronized MediaTypeRegistry getMediaTypeRegistry() {
+ if (mediaTypeRegistry == null) {
+ mediaTypeRegistry = MediaTypeRegistry.getDefaultRegistry();
+ }
return mediaTypeRegistry;
}
}