This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8dc960aa15 TIKA-4663 - add content handler type metadata and switch default to markdown (#2611)
8dc960aa15 is described below
commit 8dc960aa154a475901f3a880f258a815e12e0304
Author: Tim Allison <[email protected]>
AuthorDate: Wed Feb 18 16:06:28 2026 -0500
TIKA-4663 - add content handler type metadata and switch default to markdown (#2611)
---
.../java/org/apache/tika/metadata/TikaCoreProperties.java | 12 +++++++++++-
.../org/apache/tika/sax/BasicContentHandlerFactory.java | 7 ++++++-
.../java/org/apache/tika/sax/ContentHandlerFactory.java | 15 +++++++++++++++
.../apache/tika/sax/RecursiveParserWrapperHandler.java | 2 ++
.../org/apache/tika/async/cli/AsyncProcessorTest.java | 4 ++++
.../org/apache/tika/pipes/core/server/ParseHandler.java | 2 ++
.../java/org/apache/tika/config/loader/TikaLoader.java | 8 +++++---
7 files changed, 45 insertions(+), 5 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index ef3575f6e3..fdd52259e3 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -102,9 +102,19 @@ public interface TikaCoreProperties {
Property PARSE_TIME_MILLIS = Property.internalText(TIKA_META_PREFIX +
"parse_time_millis");
/**
- * Simple class name of the content handler
+ * Simple class name of the content handler.
+ * @deprecated Use {@link #TIKA_CONTENT_HANDLER_TYPE} for the handler type enum value.
*/
+ @Deprecated
Property TIKA_CONTENT_HANDLER = Property.internalText(TIKA_META_PREFIX +
"content_handler");
+
+ /**
+ * The handler type used to produce {@link #TIKA_CONTENT}.
+ * Value is the {@link org.apache.tika.sax.BasicContentHandlerFactory.HANDLER_TYPE}
+ * enum name (e.g. {@code TEXT}, {@code MARKDOWN}, {@code HTML}, {@code XML}).
+ */
+ Property TIKA_CONTENT_HANDLER_TYPE =
+ Property.internalText(TIKA_META_PREFIX + "content_handler_type");
Property TIKA_CONTENT = Property.internalText(TIKA_META_PREFIX +
"content");
/**
* Use this to store parse exception information in the Metadata object.
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index ddef58d96e..337eba15ab 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -39,7 +39,7 @@ import org.apache.tika.parser.ParseContext;
@TikaComponent(defaultFor = ContentHandlerFactory.class)
public class BasicContentHandlerFactory implements StreamingContentHandlerFactory, WriteLimiter {
- private HANDLER_TYPE type = HANDLER_TYPE.TEXT;
+ private HANDLER_TYPE type = HANDLER_TYPE.MARKDOWN;
private int writeLimit = -1;
private boolean throwOnWriteLimitReached = true;
private transient ParseContext parseContext;
@@ -227,6 +227,11 @@ public class BasicContentHandlerFactory implements StreamingContentHandlerFactor
return type;
}
+ @Override
+ public String handlerTypeName() {
+ return type.name();
+ }
+
/**
* Sets the handler type.
* @param type the handler type
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
index 4c7efd7231..2dfe49912d 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
@@ -39,4 +39,19 @@ public interface ContentHandlerFactory extends Serializable {
* @return a new ContentHandler instance
*/
ContentHandler createHandler();
+
+ /**
+ * Returns the name of the handler type produced by this factory
+ * (e.g. {@code TEXT}, {@code MARKDOWN}, {@code HTML}, {@code XML}).
+ * <p>
+ * This value is written to
+ * {@link org.apache.tika.metadata.TikaCoreProperties#TIKA_CONTENT_HANDLER_TYPE}
+ * so that downstream components (such as the inference pipeline) can
+ * determine what format {@code tika:content} is in without guessing.
+ *
+ * @return handler type name, never {@code null}
+ */
+ default String handlerTypeName() {
+ return "UNKNOWN";
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 3284020426..9294dcaf42 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -158,6 +158,8 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
metadata.add(TikaCoreProperties.TIKA_CONTENT, content);
metadata.add(TikaCoreProperties.TIKA_CONTENT_HANDLER,
handler.getClass().getSimpleName());
+ metadata.set(TikaCoreProperties.TIKA_CONTENT_HANDLER_TYPE,
+ getContentHandlerFactory().handlerTypeName());
}
}
}
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
index 0f31c1c1b8..585bf4b905 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
@@ -50,6 +50,8 @@ import org.apache.tika.pipes.api.pipesiterator.PipesIterator;
import org.apache.tika.pipes.core.PipesException;
import org.apache.tika.pipes.core.async.AsyncProcessor;
import org.apache.tika.pipes.core.extractor.UnpackConfig;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.serialization.JsonMetadataList;
/**
@@ -121,6 +123,8 @@ public class AsyncProcessorTest extends TikaTest {
ParseContext parseContext = new ParseContext();
parseContext.set(ParseMode.class, ParseMode.UNPACK);
parseContext.set(UnpackConfig.class, unpackConfig);
+ parseContext.set(ContentHandlerFactory.class,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
FetchEmitTuple t =
new FetchEmitTuple("myId-1", new FetchKey("fsf", "mock.xml"),
new EmitKey("fse-json", "emit-1"), new Metadata(),
parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
index 8385631ae4..79d233ba4e 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
@@ -251,6 +251,8 @@ class ParseHandler {
LOG.warn("parse exception: " + fetchEmitTuple.getId(), e);
} finally {
metadata.add(TikaCoreProperties.TIKA_CONTENT, handler.toString());
+ metadata.set(TikaCoreProperties.TIKA_CONTENT_HANDLER_TYPE,
+ contentHandlerFactory.handlerTypeName());
if (containerException != null) {
metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
containerException);
}
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 36b0f69325..23bca1686e 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@ -286,7 +286,7 @@ public class TikaLoader {
/**
* Loads and returns the content handler factory.
* If "content-handler-factory" section exists in config, uses that factory.
- * If section missing, returns a default BasicContentHandlerFactory with TEXT handler.
+ * If section missing, returns a default BasicContentHandlerFactory with MARKDOWN handler.
* Results are cached - subsequent calls return the same instance.
*
* <p>Example JSON:
@@ -315,10 +315,12 @@ public class TikaLoader {
throw new TikaConfigException("Failed to load content-handler-factory", e);
}
}
- // Default to BasicContentHandlerFactory with TEXT handler if not configured
+ // Default to BasicContentHandlerFactory with MARKDOWN handler if not configured.
+ // Markdown preserves structural boundaries (headings, lists, code blocks)
+ // which enables higher-quality chunking in the inference pipeline.
if (contentHandlerFactory == null) {
contentHandlerFactory = new BasicContentHandlerFactory(
- BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
+ BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, -1);
}
}
return contentHandlerFactory;