This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4641-step2 in repository https://gitbox.apache.org/repos/asf/tika.git
commit c9963eaae5aeee86b916744a69ed6de83c503b5f Author: tallison <[email protected]> AuthorDate: Sun Feb 1 09:47:39 2026 -0500 refactor serialization, further. add docs --- docs/modules/ROOT/nav.adoc | 2 + docs/modules/ROOT/pages/developers/index.adoc | 30 ++ .../ROOT/pages/developers/serialization.adoc | 340 +++++++++++++++++++++ .../tika/annotation/TikaComponentProcessor.java | 52 +--- .../tika/serialization/ParseContextUtils.java | 19 +- .../org/apache/tika/serialization/TikaModule.java | 20 +- .../serdes/ParseContextDeserializer.java | 28 +- 7 files changed, 443 insertions(+), 48 deletions(-) diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 8cf1e26a0a..cb217613e0 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -35,6 +35,8 @@ ** xref:advanced/setting-limits.adoc[Setting Limits] ** xref:advanced/spooling.adoc[Spooling] ** xref:advanced/embedded-documents.adoc[Embedded Document Metadata] +* xref:developers/index.adoc[Developers] +** xref:developers/serialization.adoc[Serialization and Configuration] * xref:faq.adoc[FAQ] * xref:security.adoc[Security] * xref:roadmap.adoc[Roadmap] diff --git a/docs/modules/ROOT/pages/developers/index.adoc b/docs/modules/ROOT/pages/developers/index.adoc new file mode 100644 index 0000000000..08e56a7065 --- /dev/null +++ b/docs/modules/ROOT/pages/developers/index.adoc @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + += Developer Guide + +This section provides documentation for developers who want to extend Tika +with custom parsers, detectors, and other components. + +== Topics + +* xref:serialization.adoc[Serialization and Configuration] - JSON configuration, + @TikaComponent annotation, and creating custom components + +== Coming Soon + +* Creating Custom Parsers +* Creating Custom Detectors +* Plugin Development with PF4J diff --git a/docs/modules/ROOT/pages/developers/serialization.adoc b/docs/modules/ROOT/pages/developers/serialization.adoc new file mode 100644 index 0000000000..6ec426b061 --- /dev/null +++ b/docs/modules/ROOT/pages/developers/serialization.adoc @@ -0,0 +1,340 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ += Serialization and Configuration + +Tika 4.x uses JSON-based configuration and serialization throughout the system. +This document explains how the serialization system works and how to create +components that integrate with it. + +== Overview + +Tika's serialization system provides: + +* **JSON Configuration**: Configure Tika components using JSON files +* **Friendly Names**: Reference components by name (e.g., `pdf-parser`) instead of class names +* **ParseContext Serialization**: Send per-request configuration via `FetchEmitTuple` +* **Security**: Only registered components can be instantiated from JSON + +The system is built on Jackson with custom serializers/deserializers in the +`tika-serialization` module. + +== JSON Configuration Format + +Tika uses a compact format for component configuration: + +[source,json] +---- +{ + "auto-detect-parser": { + "throwOnZeroBytes": false + }, + "parse-context": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "MD5" }, + { "algorithm": "SHA256" } + ] + } + } +} +---- + +Components can be specified as: + +* **String**: `"pdf-parser"` - creates instance with defaults +* **Object**: `{"pdf-parser": {"ocrStrategy": "AUTO"}}` - creates configured instance + +== The @TikaComponent Annotation + +The `@TikaComponent` annotation is required for any class that should be +configurable via JSON. It serves multiple purposes: + +1. **Registration**: Registers the class with a friendly name +2. **Index Generation**: Creates lookup files for name-to-class resolution +3. **SPI Registration**: Optionally registers for Java ServiceLoader +4. 
**Security**: Acts as an allowlist for deserialization + +=== Basic Usage + +[source,java] +---- +@TikaComponent +public class MyCustomParser implements Parser { + // Parser implementation +} +---- + +This automatically: + +* Generates friendly name `my-custom-parser` from the class name +* Adds to `META-INF/tika/parsers.idx` for name lookup +* Adds to `META-INF/services/org.apache.tika.parser.Parser` for SPI + +=== Annotation Attributes + +[cols="1,1,3"] +|=== +| Attribute | Default | Description + +| `name` +| (auto-generated) +| Custom friendly name instead of deriving from class name + +| `spi` +| `true` +| Whether to register in `META-INF/services/` for ServiceLoader + +| `contextKey` +| (auto-detected) +| Class to use as ParseContext key (rarely needed) + +| `defaultFor` +| (none) +| Marks as default implementation for an interface +|=== + +=== Example with Attributes + +[source,java] +---- +@TikaComponent(name = "my-parser", spi = false) +public class MyInternalParser implements Parser { + // Not auto-discovered via SPI, but configurable via JSON +} +---- + +== Context Key Detection + +When storing components in `ParseContext`, Tika needs to know which class +to use as the lookup key. For example, `CommonsDigesterFactory` should be +retrievable via `parseContext.get(DigesterFactory.class)`. 
+ +=== Automatic Detection + +Tika automatically detects the context key by checking if your class implements +one of these known interfaces: + +* `Parser`, `Detector`, `EncodingDetector` +* `MetadataFilter`, `Translator`, `Renderer` +* `DigesterFactory`, `ContentHandlerFactory` +* `EmbeddedDocumentExtractorFactory`, `MetadataWriteLimiterFactory` + +[source,java] +---- +@TikaComponent +public class CommonsDigesterFactory implements DigesterFactory { + // Context key automatically detected as DigesterFactory.class +} +---- + +=== Explicit Context Key + +For interfaces not in the auto-detection list, specify explicitly: + +[source,java] +---- +@TikaComponent(contextKey = DocumentSelector.class) +public class SkipEmbeddedDocumentSelector implements DocumentSelector { } +---- + +== Service Interface Categories + +=== First-Class Service Interfaces + +These are loaded via SPI and have dedicated index files: + +[cols="1,1"] +|=== +| Interface | Index File + +| `Parser` | `parsers.idx` +| `Detector` | `detectors.idx` +| `EncodingDetector` | `encoding-detectors.idx` +| `LanguageDetector` | `language-detectors.idx` +| `Translator` | `translators.idx` +| `Renderer` | `renderers.idx` +| `MetadataFilter` | `metadata-filters.idx` +|=== + +=== ParseContext Components + +Components not implementing first-class interfaces go to `parse-context.idx`: + +* `DigesterFactory` - Digest/checksum calculation +* `ContentHandlerFactory` - SAX content handler creation +* `EmbeddedDocumentExtractorFactory` - Embedded document handling +* `MetadataWriteLimiterFactory` - Metadata write limiting + +== Self-Configuring Components + +Components implementing `SelfConfiguring` handle their own configuration +at runtime rather than during initial loading: + +[source,java] +---- +@TikaComponent +public class PDFParser extends AbstractParser implements SelfConfiguring { + + private PDFParserConfig defaultConfig = new PDFParserConfig(); + + @Override + public void configure(ParseContext parseContext) { 
+ PDFParserConfig config = ParseContextConfig.getConfig( + parseContext, "pdf-parser", PDFParserConfig.class, defaultConfig); + // Use config... + } +} +---- + +Benefits: + +* Per-request configuration via `ParseContext` +* Lazy loading - config only parsed when needed +* Merging with defaults handled automatically + +== ParseContext Serialization + +`ParseContext` can be serialized to JSON for transmission (e.g., in `FetchEmitTuple`): + +[source,json] +---- +{ + "parseContext": { + "pdf-parser": { + "ocrStrategy": "AUTO", + "extractInlineImages": true + }, + "commons-digester-factory": { + "digests": [{"algorithm": "SHA256"}] + } + } +} +---- + +=== Typed Section + +For components that need immediate deserialization (not lazy loading): + +[source,json] +---- +{ + "parseContext": { + "typed": { + "handler-config": { + "type": "XML", + "writeLimit": 100000 + } + } + } +} +---- + +== Security Model + +The serialization system implements a security allowlist: + +1. **@TikaComponent Required**: Only annotated classes are registered +2. **Registry Lookup**: Deserialization only instantiates registered classes +3. **No Arbitrary Classes**: Unknown class names cause errors, not instantiation + +This prevents attacks where malicious JSON specifies dangerous classes +for instantiation. 
+ +[source,json] +---- +// This will FAIL - class not registered +{ + "parse-context": { + "java.lang.Runtime": {} // Error: Unknown component + } +} +---- + +== Creating a Custom Component + +Complete example of a custom metadata filter: + +[source,java] +---- +package com.example.tika; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.filter.MetadataFilter; + +@TikaComponent +public class UpperCaseFilter implements MetadataFilter { + + private String fieldName = "title"; + + public void setFieldName(String fieldName) { + this.fieldName = fieldName; + } + + public String getFieldName() { + return fieldName; + } + + @Override + public void filter(Metadata metadata) throws TikaException { + String value = metadata.get(fieldName); + if (value != null) { + metadata.set(fieldName, value.toUpperCase()); + } + } +} +---- + +Configure in JSON: + +[source,json] +---- +{ + "metadata-filters": [ + {"upper-case-filter": {"fieldName": "dc:title"}} + ] +} +---- + +Or with defaults: + +[source,json] +---- +{ + "metadata-filters": ["upper-case-filter"] +} +---- + +== Troubleshooting + +=== "Unknown component name" Error + +* Ensure class has `@TikaComponent` annotation +* Verify annotation processing ran during compilation +* Check that `META-INF/tika/*.idx` file exists in JAR + +=== Component Not Found in ParseContext + +* Verify you're using the correct interface type for lookup +* Check if explicit `contextKey` is needed +* For self-configuring components, ensure `configure()` was called + +=== SPI Not Loading Component + +* Check that `spi = true` (the default) +* Verify `META-INF/services/` file exists +* Ensure JAR is on classpath diff --git a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java index 
bcf862c4c3..132e022dfa 100644 --- a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java +++ b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java @@ -65,9 +65,9 @@ public class TikaComponentProcessor extends AbstractProcessor { * Known Tika service interfaces for SPI generation. * Only classes implementing these interfaces will have SPI files generated. * <p> - * Note: DigesterFactory and ContentHandlerFactory are NOT in this map because - * they are parse-context components, not top-level service interfaces. - * Their implementations go to parse-context.idx instead. + * Components that don't implement any of these interfaces (e.g., DigesterFactory, + * ContentHandlerFactory implementations) go to parse-context.idx instead. + * These should specify their contextKey explicitly via @TikaComponent(contextKey=...). */ private static final Map<String, String> SERVICE_INTERFACES = new LinkedHashMap<>(); @@ -82,17 +82,6 @@ public class TikaComponentProcessor extends AbstractProcessor { SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter", "metadata-filters"); } - /** - * Interfaces whose implementations should go to parse-context.idx. - * These are factory interfaces used via ParseContext, not loaded via SPI. 
- */ - private static final Set<String> PARSE_CONTEXT_INTERFACES = Set.of( - "org.apache.tika.digest.DigesterFactory", - "org.apache.tika.sax.ContentHandlerFactory", - "org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory", - "org.apache.tika.extractor.EmbeddedDocumentExtractorFactory" - ); - private Messager messager; private Filer filer; @@ -154,13 +143,8 @@ public class TikaComponentProcessor extends AbstractProcessor { " (SPI: " + includeSpi + ", contextKey: " + contextKey + ", defaultFor: " + defaultFor + ")"); - // Find all implemented service interfaces (both SPI and parse-context) + // Find all implemented service interfaces List<String> serviceInterfaces = findServiceInterfaces(element); - List<String> parseContextInterfaces = findParseContextInterfaces(element); - - // Combine all interfaces for context key detection - List<String> allInterfaces = new ArrayList<>(serviceInterfaces); - allInterfaces.addAll(parseContextInterfaces); // Build the index entry value (className or className:key=X[:default]) // Auto-detect contextKey from service interface if not explicitly specified @@ -168,16 +152,16 @@ public class TikaComponentProcessor extends AbstractProcessor { if (contextKey != null) { // Explicit contextKey specified indexValue = className + ":key=" + contextKey; - } else if (allInterfaces.size() == 1) { - // Auto-detect contextKey from single interface - indexValue = className + ":key=" + allInterfaces.get(0); + } else if (serviceInterfaces.size() == 1) { + // Auto-detect contextKey from single service interface + indexValue = className + ":key=" + serviceInterfaces.get(0); messager.printMessage(Diagnostic.Kind.NOTE, - "Auto-detected contextKey=" + allInterfaces.get(0) + " for " + className); - } else if (allInterfaces.size() > 1) { + "Auto-detected contextKey=" + serviceInterfaces.get(0) + " for " + className); + } else if (serviceInterfaces.size() > 1) { // Multiple interfaces - warn that contextKey should be specified 
messager.printMessage(Diagnostic.Kind.WARNING, "Class " + className + " implements multiple interfaces: " + - allInterfaces + ". Consider specifying @TikaComponent(contextKey=...) " + + serviceInterfaces + ". Consider specifying @TikaComponent(contextKey=...) " + "to select which one to use as ParseContext key.", element); } @@ -186,9 +170,9 @@ public class TikaComponentProcessor extends AbstractProcessor { indexValue = indexValue + ":default"; } - // Check if this is a parse-context component (implements a parse-context interface - // or doesn't implement any known service interface) - if (!parseContextInterfaces.isEmpty() || serviceInterfaces.isEmpty()) { + // Components that don't implement any known service interface go to parse-context.idx + // These should specify their contextKey explicitly via @TikaComponent(contextKey=...) + if (serviceInterfaces.isEmpty()) { // Put in parse-context.idx messager.printMessage(Diagnostic.Kind.NOTE, "Class " + className + " is a parse-context component, " + @@ -291,16 +275,6 @@ public class TikaComponentProcessor extends AbstractProcessor { return result; } - /** - * Finds all parse-context interfaces implemented by the given type element. - */ - private List<String> findParseContextInterfaces(TypeElement element) { - List<String> result = new ArrayList<>(); - Set<String> visited = new LinkedHashSet<>(); - findInterfacesRecursive(element.asType(), result, visited, PARSE_CONTEXT_INTERFACES); - return result; - } - /** * Recursively searches for interfaces in the type hierarchy. 
* diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java index 30826befa5..70104c9f26 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java @@ -159,18 +159,29 @@ public class ParseContextUtils { /** * Determines the ParseContext key for a component. * <p> - * The contextKey is auto-detected by the annotation processor from the service - * interface implemented by the component. If not detected (e.g., component implements - * multiple interfaces), falls back to the component class. + * Resolution order: + * <ol> + * <li>Explicit contextKey from .idx file (via @TikaComponent annotation)</li> + * <li>Auto-detect from implemented interfaces (using TikaModule.COMPACT_FORMAT_INTERFACES)</li> + * <li>Fall back to the component class itself</li> + * </ol> + * <p> + * Security note: This only determines the context key - it does NOT affect which + * classes can be instantiated. Classes must still be registered via @TikaComponent. 
* * @param info the component info * @return the class to use as ParseContext key */ private static Class<?> determineContextKey(ComponentInfo info) { - // Use contextKey from .idx file (auto-detected or explicit from @TikaComponent) + // Use explicit contextKey from .idx file if specified if (info.contextKey() != null) { return info.contextKey(); } + // Auto-detect from implemented interfaces at runtime + Class<?> contextKeyInterface = TikaModule.findContextKeyInterface(info.componentClass()); + if (contextKeyInterface != null) { + return contextKeyInterface; + } // Fall back to the component class itself return info.componentClass(); } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java index 2fc6406b53..775b34a9ac 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java @@ -123,12 +123,28 @@ public class TikaModule extends SimpleModule { * Returns true if the type implements any of the registered compact format interfaces. */ private static boolean usesCompactFormat(Class<?> type) { + return findContextKeyInterface(type) != null; + } + + /** + * Finds the appropriate context key interface for a given type. + * This is used to determine which interface should be used as the ParseContext key + * when storing instances of this type. + * <p> + * Security note: This method only helps determine the context key - it does NOT + * affect which classes can be instantiated. Classes must still be registered + * via @TikaComponent to be deserializable. 
+ * + * @param type the type to find the context key for + * @return the interface to use as context key, or null if none found + */ + public static Class<?> findContextKeyInterface(Class<?> type) { for (Class<?> iface : COMPACT_FORMAT_INTERFACES) { if (iface.isAssignableFrom(type)) { - return true; + return iface; } } - return false; + return null; } public TikaModule() { diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java index 3cc05f3272..740ff8d395 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java @@ -36,6 +36,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.config.loader.ComponentInfo; import org.apache.tika.parser.ParseContext; import org.apache.tika.serialization.ComponentNameResolver; +import org.apache.tika.serialization.TikaModule; /** * Deserializes ParseContext from JSON. @@ -127,6 +128,21 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { return parseContext; } + /** + * Determines the context key for a component. + * Uses explicit contextKey if available, otherwise auto-detects from interfaces. + */ + private static Class<?> determineContextKey(ComponentInfo info) { + if (info.contextKey() != null) { + return info.contextKey(); + } + Class<?> interfaceKey = TikaModule.findContextKeyInterface(info.componentClass()); + if (interfaceKey != null) { + return interfaceKey; + } + return info.componentClass(); + } + /** * Checks if a JSON config entry would create a duplicate context key. * <p> @@ -147,7 +163,7 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { } ComponentInfo info = infoOpt.get(); - Class<?> contextKey = info.contextKey() != null ? 
info.contextKey() : info.componentClass(); + Class<?> contextKey = determineContextKey(info); String existingName = seenContextKeys.get(contextKey); if (existingName != null) { @@ -205,8 +221,14 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { } } - // Use contextKey if available, otherwise use the config class itself - Class<?> parseContextKey = (contextKeyClass != null) ? contextKeyClass : configClass; + // Determine context key: explicit > interface detection > class itself + Class<?> parseContextKey = contextKeyClass; + if (parseContextKey == null) { + parseContextKey = TikaModule.findContextKeyInterface(configClass); + } + if (parseContextKey == null) { + parseContextKey = configClass; + } // Check for duplicate context key String existingName = seenContextKeys.get(parseContextKey);
