This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new ec02aebb37 TIKA-4641 -- step 2: refactor serialization, further. add
docs (#2567)
ec02aebb37 is described below
commit ec02aebb37157649457cd56524fea8ff74effcf1
Author: Tim Allison <[email protected]>
AuthorDate: Sun Feb 1 10:37:56 2026 -0500
TIKA-4641 -- step 2: refactor serialization, further. add docs (#2567)
---
docs/modules/ROOT/nav.adoc | 2 +
docs/modules/ROOT/pages/developers/index.adoc | 30 ++
.../ROOT/pages/developers/serialization.adoc | 340 +++++++++++++++++++++
.../tika/annotation/TikaComponentProcessor.java | 53 +---
.../tika/serialization/ParseContextUtils.java | 19 +-
.../org/apache/tika/serialization/TikaModule.java | 22 +-
.../serdes/ParseContextDeserializer.java | 28 +-
7 files changed, 445 insertions(+), 49 deletions(-)
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 3d63a73145..89ea2c0c8c 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -36,6 +36,8 @@
** xref:advanced/setting-limits.adoc[Setting Limits]
** xref:advanced/spooling.adoc[Spooling]
** xref:advanced/embedded-documents.adoc[Embedded Document Metadata]
+* xref:developers/index.adoc[Developers]
+** xref:developers/serialization.adoc[Serialization and Configuration]
* xref:faq.adoc[FAQ]
* xref:security.adoc[Security]
* xref:roadmap.adoc[Roadmap]
diff --git a/docs/modules/ROOT/pages/developers/index.adoc
b/docs/modules/ROOT/pages/developers/index.adoc
new file mode 100644
index 0000000000..08e56a7065
--- /dev/null
+++ b/docs/modules/ROOT/pages/developers/index.adoc
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+= Developer Guide
+
+This section provides documentation for developers who want to extend Tika
+with custom parsers, detectors, and other components.
+
+== Topics
+
+* xref:serialization.adoc[Serialization and Configuration] - JSON configuration,
+ @TikaComponent annotation, and creating custom components
+
+== Coming Soon
+
+* Creating Custom Parsers
+* Creating Custom Detectors
+* Plugin Development with PF4J
diff --git a/docs/modules/ROOT/pages/developers/serialization.adoc
b/docs/modules/ROOT/pages/developers/serialization.adoc
new file mode 100644
index 0000000000..6ec426b061
--- /dev/null
+++ b/docs/modules/ROOT/pages/developers/serialization.adoc
@@ -0,0 +1,340 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+= Serialization and Configuration
+
+Tika 4.x uses JSON-based configuration and serialization throughout the system.
+This document explains how the serialization system works and how to create
+components that integrate with it.
+
+== Overview
+
+Tika's serialization system provides:
+
+* **JSON Configuration**: Configure Tika components using JSON files
+* **Friendly Names**: Reference components by name (e.g., `pdf-parser`) instead of class names
+* **ParseContext Serialization**: Send per-request configuration via `FetchEmitTuple`
+* **Security**: Only registered components can be instantiated from JSON
+
+The system is built on Jackson with custom serializers/deserializers in the
+`tika-serialization` module.
+
+== JSON Configuration Format
+
+Tika uses a compact format for component configuration:
+
+[source,json]
+----
+{
+ "auto-detect-parser": {
+ "throwOnZeroBytes": false
+ },
+ "parse-context": {
+ "commons-digester-factory": {
+ "digests": [
+ { "algorithm": "MD5" },
+ { "algorithm": "SHA256" }
+ ]
+ }
+ }
+}
+----
+
+Components can be specified as:
+
+* **String**: `"pdf-parser"` - creates instance with defaults
+* **Object**: `{"pdf-parser": {"ocrStrategy": "AUTO"}}` - creates configured instance
+
+== The @TikaComponent Annotation
+
+The `@TikaComponent` annotation is required for any class that should be
+configurable via JSON. It serves multiple purposes:
+
+1. **Registration**: Registers the class with a friendly name
+2. **Index Generation**: Creates lookup files for name-to-class resolution
+3. **SPI Registration**: Optionally registers for Java ServiceLoader
+4. **Security**: Acts as an allowlist for deserialization
+
+=== Basic Usage
+
+[source,java]
+----
+@TikaComponent
+public class MyCustomParser implements Parser {
+ // Parser implementation
+}
+----
+
+This automatically:
+
+* Generates friendly name `my-custom-parser` from the class name
+* Adds to `META-INF/tika/parsers.idx` for name lookup
+* Adds to `META-INF/services/org.apache.tika.parser.Parser` for SPI
+
+=== Annotation Attributes
+
+[cols="1,1,3"]
+|===
+| Attribute | Default | Description
+
+| `name`
+| (auto-generated)
+| Custom friendly name instead of deriving from class name
+
+| `spi`
+| `true`
+| Whether to register in `META-INF/services/` for ServiceLoader
+
+| `contextKey`
+| (auto-detected)
+| Class to use as ParseContext key (rarely needed)
+
+| `defaultFor`
+| (none)
+| Marks as default implementation for an interface
+|===
+
+=== Example with Attributes
+
+[source,java]
+----
+@TikaComponent(name = "my-parser", spi = false)
+public class MyInternalParser implements Parser {
+ // Not auto-discovered via SPI, but configurable via JSON
+}
+----
+
+== Context Key Detection
+
+When storing components in `ParseContext`, Tika needs to know which class
+to use as the lookup key. For example, `CommonsDigesterFactory` should be
+retrievable via `parseContext.get(DigesterFactory.class)`.
+
+=== Automatic Detection
+
+Tika automatically detects the context key by checking if your class implements
+one of these known interfaces:
+
+* `Parser`, `Detector`, `EncodingDetector`
+* `MetadataFilter`, `Translator`, `Renderer`
+* `DigesterFactory`, `ContentHandlerFactory`, `ContentHandlerDecoratorFactory`
+* `EmbeddedDocumentExtractorFactory`, `MetadataWriteLimiterFactory`, `UnpackSelector`
+
+[source,java]
+----
+@TikaComponent
+public class CommonsDigesterFactory implements DigesterFactory {
+ // Context key automatically detected as DigesterFactory.class
+}
+----
+
+=== Explicit Context Key
+
+For interfaces not in the auto-detection list, specify explicitly:
+
+[source,java]
+----
+@TikaComponent(contextKey = DocumentSelector.class)
+public class SkipEmbeddedDocumentSelector implements DocumentSelector { }
+----
+
+== Service Interface Categories
+
+=== First-Class Service Interfaces
+
+These are loaded via SPI and have dedicated index files:
+
+[cols="1,1"]
+|===
+| Interface | Index File
+
+| `Parser` | `parsers.idx`
+| `Detector` | `detectors.idx`
+| `EncodingDetector` | `encoding-detectors.idx`
+| `LanguageDetector` | `language-detectors.idx`
+| `Translator` | `translators.idx`
+| `Renderer` | `renderers.idx`
+| `MetadataFilter` | `metadata-filters.idx`
+|===
+
+=== ParseContext Components
+
+Components not implementing first-class interfaces go to `parse-context.idx`:
+
+* `DigesterFactory` - Digest/checksum calculation
+* `ContentHandlerFactory` - SAX content handler creation
+* `EmbeddedDocumentExtractorFactory` - Embedded document handling
+* `MetadataWriteLimiterFactory` - Metadata write limiting
+
+== Self-Configuring Components
+
+Components implementing `SelfConfiguring` handle their own configuration
+at runtime rather than during initial loading:
+
+[source,java]
+----
+@TikaComponent
+public class PDFParser extends AbstractParser implements SelfConfiguring {
+
+ private PDFParserConfig defaultConfig = new PDFParserConfig();
+
+ @Override
+ public void configure(ParseContext parseContext) {
+ PDFParserConfig config = ParseContextConfig.getConfig(
+ parseContext, "pdf-parser", PDFParserConfig.class, defaultConfig);
+ // Use config...
+ }
+}
+----
+
+Benefits:
+
+* Per-request configuration via `ParseContext`
+* Lazy loading - config only parsed when needed
+* Merging with defaults handled automatically
+
+== ParseContext Serialization
+
+`ParseContext` can be serialized to JSON for transmission (e.g., in `FetchEmitTuple`):
+
+[source,json]
+----
+{
+ "parseContext": {
+ "pdf-parser": {
+ "ocrStrategy": "AUTO",
+ "extractInlineImages": true
+ },
+ "commons-digester-factory": {
+ "digests": [{"algorithm": "SHA256"}]
+ }
+ }
+}
+----
+
+=== Typed Section
+
+Components that need immediate deserialization (rather than lazy loading) are placed in a `typed` section:
+
+[source,json]
+----
+{
+ "parseContext": {
+ "typed": {
+ "handler-config": {
+ "type": "XML",
+ "writeLimit": 100000
+ }
+ }
+ }
+}
+----
+
+== Security Model
+
+The serialization system implements a security allowlist:
+
+1. **@TikaComponent Required**: Only annotated classes are registered
+2. **Registry Lookup**: Deserialization only instantiates registered classes
+3. **No Arbitrary Classes**: Unknown class names cause errors, not instantiation
+
+This prevents attacks where malicious JSON specifies dangerous classes
+for instantiation.
+
+[source,json]
+----
+// This will FAIL - class not registered
+{
+ "parse-context": {
+ "java.lang.Runtime": {} // Error: Unknown component
+ }
+}
+----
+
+== Creating a Custom Component
+
+Complete example of a custom metadata filter:
+
+[source,java]
+----
+package com.example.tika;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.filter.MetadataFilter;
+
+@TikaComponent
+public class UpperCaseFilter implements MetadataFilter {
+
+ private String fieldName = "title";
+
+ public void setFieldName(String fieldName) {
+ this.fieldName = fieldName;
+ }
+
+ public String getFieldName() {
+ return fieldName;
+ }
+
+ @Override
+ public void filter(Metadata metadata) throws TikaException {
+ String value = metadata.get(fieldName);
+ if (value != null) {
+ metadata.set(fieldName, value.toUpperCase());
+ }
+ }
+}
+----
+
+Configure in JSON:
+
+[source,json]
+----
+{
+ "metadata-filters": [
+ {"upper-case-filter": {"fieldName": "dc:title"}}
+ ]
+}
+----
+
+Or with defaults:
+
+[source,json]
+----
+{
+ "metadata-filters": ["upper-case-filter"]
+}
+----
+
+== Troubleshooting
+
+=== "Unknown component name" Error
+
+* Ensure class has `@TikaComponent` annotation
+* Verify annotation processing ran during compilation
+* Check that `META-INF/tika/*.idx` file exists in JAR
+
+=== Component Not Found in ParseContext
+
+* Verify you're using the correct interface type for lookup
+* Check if explicit `contextKey` is needed
+* For self-configuring components, ensure `configure()` was called
+
+=== SPI Not Loading Component
+
+* Check that `spi = true` (the default)
+* Verify `META-INF/services/` file exists
+* Ensure JAR is on classpath
diff --git
a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
index b00be101e3..132e022dfa 100644
---
a/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
+++
b/tika-annotation-processor/src/main/java/org/apache/tika/annotation/TikaComponentProcessor.java
@@ -65,9 +65,9 @@ public class TikaComponentProcessor extends AbstractProcessor
{
* Known Tika service interfaces for SPI generation.
* Only classes implementing these interfaces will have SPI files
generated.
* <p>
- * Note: DigesterFactory and ContentHandlerFactory are NOT in this map
because
- * they are parse-context components, not top-level service interfaces.
- * Their implementations go to parse-context.idx instead.
+ * Components that don't implement any of these interfaces (e.g.,
DigesterFactory,
+ * ContentHandlerFactory implementations) go to parse-context.idx instead.
+ * These should specify their contextKey explicitly via
@TikaComponent(contextKey=...).
*/
private static final Map<String, String> SERVICE_INTERFACES = new
LinkedHashMap<>();
@@ -82,18 +82,6 @@ public class TikaComponentProcessor extends
AbstractProcessor {
SERVICE_INTERFACES.put("org.apache.tika.metadata.filter.MetadataFilter",
"metadata-filters");
}
- /**
- * Interfaces whose implementations should go to parse-context.idx.
- * These are factory interfaces used via ParseContext, not loaded via SPI.
- */
- private static final Set<String> PARSE_CONTEXT_INTERFACES = Set.of(
- "org.apache.tika.digest.DigesterFactory",
- "org.apache.tika.sax.ContentHandlerFactory",
- "org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory",
- "org.apache.tika.extractor.EmbeddedDocumentExtractorFactory",
- "org.apache.tika.extractor.UnpackSelector"
- );
-
private Messager messager;
private Filer filer;
@@ -155,13 +143,8 @@ public class TikaComponentProcessor extends
AbstractProcessor {
" (SPI: " + includeSpi + ", contextKey: " + contextKey +
", defaultFor: " + defaultFor + ")");
- // Find all implemented service interfaces (both SPI and parse-context)
+ // Find all implemented service interfaces
List<String> serviceInterfaces = findServiceInterfaces(element);
- List<String> parseContextInterfaces =
findParseContextInterfaces(element);
-
- // Combine all interfaces for context key detection
- List<String> allInterfaces = new ArrayList<>(serviceInterfaces);
- allInterfaces.addAll(parseContextInterfaces);
// Build the index entry value (className or className:key=X[:default])
// Auto-detect contextKey from service interface if not explicitly
specified
@@ -169,16 +152,16 @@ public class TikaComponentProcessor extends
AbstractProcessor {
if (contextKey != null) {
// Explicit contextKey specified
indexValue = className + ":key=" + contextKey;
- } else if (allInterfaces.size() == 1) {
- // Auto-detect contextKey from single interface
- indexValue = className + ":key=" + allInterfaces.get(0);
+ } else if (serviceInterfaces.size() == 1) {
+ // Auto-detect contextKey from single service interface
+ indexValue = className + ":key=" + serviceInterfaces.get(0);
messager.printMessage(Diagnostic.Kind.NOTE,
- "Auto-detected contextKey=" + allInterfaces.get(0) + " for
" + className);
- } else if (allInterfaces.size() > 1) {
+ "Auto-detected contextKey=" + serviceInterfaces.get(0) + "
for " + className);
+ } else if (serviceInterfaces.size() > 1) {
// Multiple interfaces - warn that contextKey should be specified
messager.printMessage(Diagnostic.Kind.WARNING,
"Class " + className + " implements multiple interfaces: "
+
- allInterfaces + ". Consider specifying
@TikaComponent(contextKey=...) " +
+ serviceInterfaces + ". Consider specifying
@TikaComponent(contextKey=...) " +
"to select which one to use as ParseContext key.",
element);
}
@@ -187,9 +170,9 @@ public class TikaComponentProcessor extends
AbstractProcessor {
indexValue = indexValue + ":default";
}
- // Check if this is a parse-context component (implements a
parse-context interface
- // or doesn't implement any known service interface)
- if (!parseContextInterfaces.isEmpty() || serviceInterfaces.isEmpty()) {
+ // Components that don't implement any known service interface go to
parse-context.idx
+ // These should specify their contextKey explicitly via
@TikaComponent(contextKey=...)
+ if (serviceInterfaces.isEmpty()) {
// Put in parse-context.idx
messager.printMessage(Diagnostic.Kind.NOTE,
"Class " + className + " is a parse-context component, " +
@@ -292,16 +275,6 @@ public class TikaComponentProcessor extends
AbstractProcessor {
return result;
}
- /**
- * Finds all parse-context interfaces implemented by the given type
element.
- */
- private List<String> findParseContextInterfaces(TypeElement element) {
- List<String> result = new ArrayList<>();
- Set<String> visited = new LinkedHashSet<>();
- findInterfacesRecursive(element.asType(), result, visited,
PARSE_CONTEXT_INTERFACES);
- return result;
- }
-
/**
* Recursively searches for interfaces in the type hierarchy.
*
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
index 30826befa5..70104c9f26 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
@@ -159,18 +159,29 @@ public class ParseContextUtils {
/**
* Determines the ParseContext key for a component.
* <p>
- * The contextKey is auto-detected by the annotation processor from the
service
- * interface implemented by the component. If not detected (e.g.,
component implements
- * multiple interfaces), falls back to the component class.
+ * Resolution order:
+ * <ol>
+ * <li>Explicit contextKey from .idx file (via @TikaComponent
annotation)</li>
+ * <li>Auto-detect from implemented interfaces (using
TikaModule.COMPACT_FORMAT_INTERFACES)</li>
+ * <li>Fall back to the component class itself</li>
+ * </ol>
+ * <p>
+ * Security note: This only determines the context key - it does NOT
affect which
+ * classes can be instantiated. Classes must still be registered via
@TikaComponent.
*
* @param info the component info
* @return the class to use as ParseContext key
*/
private static Class<?> determineContextKey(ComponentInfo info) {
- // Use contextKey from .idx file (auto-detected or explicit from
@TikaComponent)
+ // Use explicit contextKey from .idx file if specified
if (info.contextKey() != null) {
return info.contextKey();
}
+ // Auto-detect from implemented interfaces at runtime
+ Class<?> contextKeyInterface =
TikaModule.findContextKeyInterface(info.componentClass());
+ if (contextKeyInterface != null) {
+ return contextKeyInterface;
+ }
// Fall back to the component class itself
return info.componentClass();
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
index 2fc6406b53..8277632830 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
@@ -52,6 +52,7 @@ import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
+import org.apache.tika.extractor.UnpackSelector;
import org.apache.tika.language.translate.Translator;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.filter.MetadataFilter;
@@ -116,6 +117,7 @@ public class TikaModule extends SimpleModule {
COMPACT_FORMAT_INTERFACES.add(MetadataWriteLimiterFactory.class);
COMPACT_FORMAT_INTERFACES.add(ContentHandlerDecoratorFactory.class);
COMPACT_FORMAT_INTERFACES.add(ContentHandlerFactory.class);
+ COMPACT_FORMAT_INTERFACES.add(UnpackSelector.class);
}
/**
@@ -123,12 +125,28 @@ public class TikaModule extends SimpleModule {
* Returns true if the type implements any of the registered compact
format interfaces.
*/
private static boolean usesCompactFormat(Class<?> type) {
+ return findContextKeyInterface(type) != null;
+ }
+
+ /**
+ * Finds the appropriate context key interface for a given type.
+ * This is used to determine which interface should be used as the
ParseContext key
+ * when storing instances of this type.
+ * <p>
+ * Security note: This method only helps determine the context key - it
does NOT
+ * affect which classes can be instantiated. Classes must still be
registered
+ * via @TikaComponent to be deserializable.
+ *
+ * @param type the type to find the context key for
+ * @return the interface to use as context key, or null if none found
+ */
+ public static Class<?> findContextKeyInterface(Class<?> type) {
for (Class<?> iface : COMPACT_FORMAT_INTERFACES) {
if (iface.isAssignableFrom(type)) {
- return true;
+ return iface;
}
}
- return false;
+ return null;
}
public TikaModule() {
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
index 639822cee9..bacbb40741 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
@@ -36,6 +36,7 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.config.loader.ComponentInfo;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.serialization.ComponentNameResolver;
+import org.apache.tika.serialization.TikaModule;
/**
* Deserializes ParseContext from JSON.
@@ -127,6 +128,21 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
return parseContext;
}
+ /**
+ * Determines the context key for a component.
+ * Uses explicit contextKey if available, otherwise auto-detects from
interfaces.
+ */
+ private static Class<?> determineContextKey(ComponentInfo info) {
+ if (info.contextKey() != null) {
+ return info.contextKey();
+ }
+ Class<?> interfaceKey =
TikaModule.findContextKeyInterface(info.componentClass());
+ if (interfaceKey != null) {
+ return interfaceKey;
+ }
+ return info.componentClass();
+ }
+
/**
* Checks if a JSON config entry would create a duplicate context key.
* <p>
@@ -147,7 +163,7 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
}
ComponentInfo info = infoOpt.get();
- Class<?> contextKey = info.contextKey() != null ? info.contextKey() :
info.componentClass();
+ Class<?> contextKey = determineContextKey(info);
String existingName = seenContextKeys.get(contextKey);
if (existingName != null) {
@@ -205,8 +221,14 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
}
}
- // Use contextKey if available, otherwise use the config class
itself
- Class<?> parseContextKey = (contextKeyClass != null) ?
contextKeyClass : configClass;
+ // Determine context key: explicit > interface detection > class
itself
+ Class<?> parseContextKey = contextKeyClass;
+ if (parseContextKey == null) {
+ parseContextKey =
TikaModule.findContextKeyInterface(configClass);
+ }
+ if (parseContextKey == null) {
+ parseContextKey = configClass;
+ }
// Check for duplicate context key
String existingName = seenContextKeys.get(parseContextKey);