This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new a23433c4b4 TIKA-4663 - make markdown the default content handler in
tika-app, tika-server, and the async CLI (#2877)
a23433c4b4 is described below
commit a23433c4b477d13cc45b996af08dd91c9b49578e
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 20:23:39 2026 -0400
TIKA-4663 - make markdown the default content handler in tika-app,
tika-server, and the async CLI (#2877)
* TIKA-4663 - make markdown the default content handler in tika-app,
tika-server, and the async CLI
---
CHANGES.txt | 12 ++++++++++++
.../ROOT/pages/migration-to-4x/migrating-to-4x.adoc | 10 ++++++++++
docs/modules/ROOT/pages/pipes/getting-started.adoc | 2 +-
docs/modules/ROOT/pages/pipes/parse-modes.adoc | 2 +-
docs/modules/ROOT/pages/using-tika/cli/index.adoc | 8 ++++----
docs/modules/ROOT/pages/using-tika/server/index.adoc | 4 ++--
.../main/java/org/apache/tika/cli/AsyncHelper.java | 4 ++++
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 12 +++++++-----
.../java/org/apache/tika/cli/AsyncHelperTest.java | 7 +++++++
.../test/java/org/apache/tika/cli/TikaCLITest.java | 12 ++++++++----
.../apache/tika/sax/BasicContentHandlerFactory.java | 2 +-
tika-grpc/src/main/proto/tika.proto | 1 +
.../java/org/apache/tika/async/cli/TikaAsyncCLI.java | 6 +++---
.../apache/tika/async/cli/AsyncCliParserTest.java | 3 +++
.../core/resource/RecursiveMetadataResource.java | 2 +-
.../server/core/RecursiveMetadataResourceTest.java | 20 ++++++++++++++++++++
.../server/standard/RecursiveMetadataFilterTest.java | 3 ++-
.../standard/RecursiveMetadataResourceTest.java | 15 ++++++++++-----
18 files changed, 97 insertions(+), 28 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 2e7a5898a5..31c3876a0e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,15 @@
+Release 4.0.0-beta-1 - unreleased
+
+ BREAKING CHANGES
+
+ * The default content handler is now Markdown. tika-app, tika-server
+ (the /tika and /rmeta endpoints), and the async/pipes CLI now emit
+ Markdown content by default (previously XHTML/XML, or plain text for
+ the async CLI). Request the previous format explicitly, e.g. tika-app
+ -x/--xml, the server /tika/xml and /rmeta/xml paths (or the
+ X-Tika-Handler header), and the async CLI --handler t (TIKA-4663).
+
+
Release 4.0.0-alpha-1 - 5/4/2026
BREAKING CHANGES
diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
index 1aed5c00fa..8ce3c69afb 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
@@ -40,6 +40,16 @@ java -jar tika-app-<version>.jar [option...] [file...]
If you have build scripts or container images that drop in just the jar,
update them to unpack the zip and run from inside it.
+== Default content handler: XHTML/XML -> Markdown
+
+In 3.x the default content handler produced XHTML/XML. In 4.x the default is
**Markdown** everywhere:
+
+* `tika-app` outputs Markdown by default (was XHTML). Pass `-x`/`--xml`,
`-h`/`--html`, or `-t`/`--text` to choose another format.
+* `tika-server` — the `/tika` and `/rmeta` endpoints return Markdown content
by default (was XHTML/XML). Use an explicit handler path (`/tika/xml`,
`/rmeta/xml`, ...) or the `X-Tika-Handler` header to choose another format.
+* The async/pipes CLI emits Markdown by default (was plain text). Use
`--handler t` to restore plain text, or `--handler x` (etc.) for another format.
+
+If you parse the extracted content programmatically and expect XHTML/XML,
request it explicitly as shown above (TIKA-4663).
+
== Configuration: XML to JSON
Tika 4.x uses JSON configuration files instead of XML. The legacy
`tika-config.xml` format
diff --git a/docs/modules/ROOT/pages/pipes/getting-started.adoc
b/docs/modules/ROOT/pages/pipes/getting-started.adoc
index db6955aeb7..4baa06c88b 100644
--- a/docs/modules/ROOT/pages/pipes/getting-started.adoc
+++ b/docs/modules/ROOT/pages/pipes/getting-started.adoc
@@ -53,7 +53,7 @@ java -jar tika-app.jar -i /data/input -o /data/output
--handler t
java -jar tika-app.jar -i /data/input -o /data/output -Z
----
-Handler types: `t` (text), `h` (html), `x` (xml), `m` (markdown), `b` (body),
`i` (ignore/metadata only).
+Handler types: `t` (text), `h` (html), `x` (xml), `m` (markdown), `b` (body),
`i` (ignore/metadata only). The default is `m` (markdown).
== JSON Configuration
diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
index ab81227ac9..1bcc401a24 100644
--- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
+++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
@@ -69,7 +69,7 @@ and applies to all modes that produce content (`RMETA`,
`CONCATENATE`, `CONTENT_
Accepted `type` values: `TEXT`, `HTML`, `XML`, `MARKDOWN`, `BODY`, `IGNORE`.
The CLI
`--handler` flag uses single-letter shortcuts (`t`, `h`, `x`, `m`, `b`, `i`)
that map onto
-these values.
+these values. If unset, the default is `MARKDOWN`.
[cols="1,1,2"]
|===
diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
index 55e7b5aa75..d7c1fe7d17 100644
--- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
@@ -118,7 +118,7 @@ options see <<_tika_pipes_processing,Tika Pipes
Processing>> below.
|Option |Description
|`-x` or `--xml`
-|Output XHTML content (default)
+|Output XHTML content
|`-h` or `--html`
|Output HTML content
@@ -127,7 +127,7 @@ options see <<_tika_pipes_processing,Tika Pipes
Processing>> below.
|Output plain text content (body)
|`--md`
-|Output Markdown content (body)
+|Output Markdown content (body) (default)
|`-T` or `--text-main`
|Output plain text — main content only, via the boilerpipe handler
@@ -145,7 +145,7 @@ options see <<_tika_pipes_processing,Tika Pipes
Processing>> below.
|Output metadata in XMP
|`-J` or `--jsonRecursive`
-|Output metadata and content from all embedded files. Combine with
`-x`/`-h`/`-t`/`-m` to choose the content type (default: `-x`).
+|Output metadata and content from all embedded files. Combine with
`-x`/`-h`/`-t`/`--md`/`-m` to choose the content type (default: `--md`).
|`-r` or `--pretty-print`
|For JSON, XML, and XHTML output, add newlines and whitespace for readability.
@@ -378,7 +378,7 @@ This processes all files in the input directory and writes
JSON metadata
|Option |Description
|`--handler=<X>`
-|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body,
`i`=ignore. Default: `t`.
+|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body,
`i`=ignore. Default: `m`.
|`--concatenate`
|Concatenate content from all embedded documents into a single content field.
diff --git a/docs/modules/ROOT/pages/using-tika/server/index.adoc
b/docs/modules/ROOT/pages/using-tika/server/index.adoc
index e630f97887..0c92d3e60f 100644
--- a/docs/modules/ROOT/pages/using-tika/server/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/server/index.adoc
@@ -117,7 +117,7 @@ For the root `/tika` PUT endpoint you can also pick the
handler with a header:
curl -T document.pdf -H "X-Tika-Handler: markdown" http://localhost:9998/tika
----
-Accepted values: `text`, `html`, `xml`, `markdown` (or `md`), `body`, `ignore`.
+Accepted values: `text`, `html`, `xml`, `markdown` (or `md`), `body`,
`ignore`. The default is `markdown`.
=== Recursive Metadata (`/rmeta`)
@@ -126,7 +126,7 @@ array of metadata objects. The handler controls the content
field of each entry:
[source,bash]
----
-curl -T document.pdf http://localhost:9998/rmeta # default: text
+curl -T document.pdf http://localhost:9998/rmeta # default: markdown
curl -T document.pdf http://localhost:9998/rmeta/text
curl -T document.pdf http://localhost:9998/rmeta/html
curl -T document.pdf http://localhost:9998/rmeta/xml
diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
index df99107871..3314298958 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
@@ -78,6 +78,10 @@ public class AsyncHelper {
// Translate TikaCLI xml output to the TikaAsyncCLI handler
type.
argList.add("--handler");
argList.add("x");
+ } else if (arg.equals("--md")) {
+ // Translate TikaCLI markdown output to the TikaAsyncCLI
handler type.
+ argList.add("--handler");
+ argList.add("m");
} else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
// TikaAsyncCLI always outputs JSON with recursive metadata
(RMETA mode)
// This is already the default, so we just skip this arg
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index b420db0ac2..5bea9f0da6 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -243,7 +243,7 @@ public class TikaCLI {
return getTransformerHandler(output, "xml", encoding, prettyPrint);
}
};
- private OutputType type = XML;
+ private OutputType type = MARKDOWN;
private final OutputType HTML = new OutputType() {
@Override
protected ContentHandler getContentHandler(OutputStream output,
Metadata metadata) throws Exception {
@@ -793,6 +793,8 @@ public class TikaCLI {
handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
} else if (type.equals(TEXT_MAIN)) {
handlerType = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
+ } else if (type.equals(MARKDOWN)) {
+ handlerType = BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN;
} else if (type.equals(METADATA)) {
handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
}
@@ -823,10 +825,10 @@ public class TikaCLI {
out.println(" writing the JSON to stdout. Redirect to save,
e.g.:");
out.println(" --convert-config-xml-to-json=tika-config.xml >
tika-config.json");
out.println("");
- out.println(" -x or --xml Output XHTML content
(default)");
+ out.println(" -x or --xml Output XHTML content");
out.println(" -h or --html Output HTML content");
out.println(" -t or --text Output plain text content
(body)");
- out.println(" --md Output Markdown content
(body)");
+ out.println(" --md Output Markdown content (body)
(default)");
out.println(" -T or --text-main Output plain text content
(main content only via boilerpipe handler)");
out.println(" -A or --text-all Output all text content");
out.println(" -m or --metadata Output only metadata");
@@ -834,7 +836,7 @@ public class TikaCLI {
out.println(" -y or --xmp Output metadata in XMP");
out.println(" -J or --jsonRecursive Output metadata and content
from all");
out.println(" embedded files (choose content
type");
- out.println(" with -x, -h, -t or -m; default
is -x)");
+ out.println(" with -x, -h, -t or -m; default
is --md)");
out.println(" -a or --async Run Tika in async mode; must
specify details in a" + " tikaConfig file");
out.println(" -l or --language Output only language");
out.println(" -d or --detect Detect document type");
@@ -915,7 +917,7 @@ public class TikaCLI {
out.println(" -c, --config=<file> Tika config file
(--config=<file> also accepted)");
out.println(" -p, --pluginsDir Plugins directory");
out.println(" --fileList File list (one path per
line, relative to -i or absolute)");
- out.println(" --handler Handler type: t=text,
h=html, x=xml, m=markdown, b=body, i=ignore");
+ out.println(" --handler Handler type: t=text,
h=html, x=xml, m=markdown, b=body, i=ignore (default: m)");
out.println(" --concatenate Concatenate content from
all embedded documents");
out.println(" --content-only Output only extracted
content (no JSON wrapper); implies --concatenate");
out.println(" --on-exists Behavior when an output
file exists: exception (default), replace, skip");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
index bb668b7660..e59434b519 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
@@ -57,6 +57,13 @@ public class AsyncHelperTest {
assertArrayEquals(expected, AsyncHelper.translateArgs(args));
}
+ @Test
+ public void testMarkdownHandler() throws Exception {
+ String[] args = new String[]{"--md", "input", "output"};
+ String[] expected = new String[]{"--handler", "m", "input", "output"};
+ assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+ }
+
@Test
public void testExtractLongFormTranslatedToZ() throws Exception {
// TIKA-4736: tika-app's --extract is the long form of -z. It must be
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 51889ac259..0cdb516f11 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -259,7 +259,7 @@ public class TikaCLITest {
@Test
public void testExtractJavascript() throws Exception {
- String json = getParamOutContent("-J", resourcePrefix +
"testPDFPackage.pdf");
+ String json = getParamOutContent("-J", "-x", resourcePrefix +
"testPDFPackage.pdf");
assertTrue(json.contains("type=\\\"PDActionJavaScript\\\""));
assertTrue(json.contains("MACRO"));
assertTrue(json.contains("NAMES_TREE"));
@@ -341,7 +341,7 @@ public class TikaCLITest {
*/
@Test
public void testListMetModels() throws Exception {
- String content = getParamOutContent("--list-met-models",
resourcePrefix + "alice.cli.test");
+ String content = getParamOutContent("--list-met-models", "-x",
resourcePrefix + "alice.cli.test");
assertTrue(content.contains("text/plain"));
}
@@ -663,7 +663,7 @@ public class TikaCLITest {
@Test
public void testConfig() throws Exception {
- String content = getParamOutContent("--config=" +
CONFIGS_DIR.toString() + "/tika-config1.json", resourcePrefix + "bad_xml.xml");
+ String content = getParamOutContent("--config=" +
CONFIGS_DIR.toString() + "/tika-config1.json", "-x", resourcePrefix +
"bad_xml.xml");
assertTrue(content.contains("apple"));
assertTrue(content.contains("org.apache.tika.parser.html.JSoupParser"));
}
@@ -679,8 +679,12 @@ public class TikaCLITest {
@Test
public void testJsonRecursiveMetadataParserDefault() throws Exception {
+ // TIKA-4663: default handler is markdown, so recursive content is
markdown, not XHTML.
String content = getParamOutContent("-J", "-r", resourcePrefix +
"test_recursive_embedded.docx");
- assertTrue(content.contains("\"X-TIKA:content\" : \"<html
xmlns=\\\"http://www.w3.org/1999/xhtml"));
+ assertFalse(content.contains("<html
xmlns=\\\"http://www.w3.org/1999/xhtml"),
+ "default recursive content should be markdown, not XHTML");
+ assertTrue(content.contains("# embed1.zip"),
+ "default recursive content should be markdown (heading
syntax)");
}
@Test
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 53d382b2e0..a7907fe3b7 100644
---
a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++
b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -47,7 +47,7 @@ public class BasicContentHandlerFactory implements
StreamingContentHandlerFactor
/**
* No-arg constructor for bean-style configuration (e.g., Jackson
deserialization).
- * Creates a factory with TEXT handler type, unlimited write, and
throwOnWriteLimitReached=true.
+ * Creates a factory with MARKDOWN handler type, unlimited write, and
throwOnWriteLimitReached=true.
*/
public BasicContentHandlerFactory() {
}
diff --git a/tika-grpc/src/main/proto/tika.proto
b/tika-grpc/src/main/proto/tika.proto
index 0a64f37115..1979b2b162 100644
--- a/tika-grpc/src/main/proto/tika.proto
+++ b/tika-grpc/src/main/proto/tika.proto
@@ -101,6 +101,7 @@ message FetchAndParseRequest {
// The ID of the emitter to use (optional). If not provided, no emitter will
be used.
string emitter_id = 4;
// Optional JSON object to configure the ParseContext for this request,
overriding server defaults.
+ // When unset, the content handler defaults to markdown.
// Keys are parse-context component names; values are their JSON configs.
// Example: {"basic-content-handler-factory": {"type": "HTML"},
"timeout-limits": {"progressTimeoutMillis": 30000}}
// See the parse-context.idx component registry for available component
names.
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index fb19447111..d9b96d359b 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -71,7 +71,7 @@ public class TikaAsyncCLI {
options.addOption(null, "Xmx", true, "heap for the forked clients,
e.g. --Xmx 1g");
options.addOption("h", "help", false, "this help message");
options.addOption("T", "timeoutMs", true, "timeout for each parse in
milliseconds");
- options.addOption(null, "handler", true, "handler type: t=text,
h=html, x=xml, m=markdown, b=body, i=ignore");
+ options.addOption(null, "handler", true, "handler type: t=text,
h=html, x=xml, m=markdown, b=body, i=ignore (default: m)");
options.addOption("p", "pluginsDir", true, "plugins directory");
options.addOption("l", "fileList", true,
"file containing one path per line (relative to inputDir or
absolute)");
@@ -164,7 +164,7 @@ public class TikaAsyncCLI {
if (args.length == 2 && ! args[0].startsWith("-")) {
return new SimpleAsyncConfig(args[0], args[1], 1,
30000L, "-Xmx1g", null, null,
- BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+ BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
SimpleAsyncConfig.ExtractBytesMode.NONE, null);
}
@@ -185,7 +185,7 @@ public class TikaAsyncCLI {
String tikaConfig = null;
String asyncConfig = null;
String pluginsDir = null;
- BasicContentHandlerFactory.HANDLER_TYPE handlerType =
BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType =
BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN;
SimpleAsyncConfig.ExtractBytesMode extractBytesMode =
SimpleAsyncConfig.ExtractBytesMode.NONE;
if (line.hasOption("i")) {
inputDir = line.getOptionValue("i");
diff --git
a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
index b8960b7c08..975506989e 100644
---
a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
+++
b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
@@ -46,6 +46,8 @@ public class AsyncCliParserTest {
assertEquals(1, simpleAsyncConfig.getNumClients());
assertEquals(30000L, simpleAsyncConfig.getTimeoutMs());
assertEquals("-Xmx1g", simpleAsyncConfig.getXmx());
+ // TIKA-4663: default content handler is markdown
+ assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
simpleAsyncConfig.getHandlerType());
simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(new String[]{"-o",
"output", "-i", "input"});
assertEquals("input", simpleAsyncConfig.getInputDir());
@@ -54,6 +56,7 @@ public class AsyncCliParserTest {
assertNull(simpleAsyncConfig.getNumClients());
assertNull(simpleAsyncConfig.getTimeoutMs());
assertNull(simpleAsyncConfig.getXmx());
+ assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
simpleAsyncConfig.getHandlerType());
simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(new
String[]{"-output", "output", "-input", "input"});
assertEquals("input", simpleAsyncConfig.getInputDir());
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index a129e47f30..2f33377762 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -51,7 +51,7 @@ import org.apache.tika.server.core.MetadataList;
public class RecursiveMetadataResource {
protected static final String HANDLER_TYPE_PARAM = "handler";
- protected static final BasicContentHandlerFactory.HANDLER_TYPE
DEFAULT_HANDLER_TYPE = BasicContentHandlerFactory.HANDLER_TYPE.XML;
+ protected static final BasicContentHandlerFactory.HANDLER_TYPE
DEFAULT_HANDLER_TYPE = BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN;
private static final Logger LOG =
LoggerFactory.getLogger(RecursiveMetadataResource.class);
/**
diff --git
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
index 106db976d9..930225db26 100644
---
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
+++
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.server.core;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
import java.io.InputStream;
import java.io.InputStreamReader;
@@ -71,6 +72,25 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
assertContains("null pointer message",
metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION));
}
+
+ @Test
+ public void testDefaultHandlerIsMarkdown() throws Exception {
+ // TIKA-4663: /rmeta with no handler now defaults to markdown (was
xml).
+ String defaultContent = rmetaContent("");
+ assertEquals(rmetaContent("/markdown"), defaultContent,
+ "default /rmeta handler should be markdown");
+ assertNotEquals(rmetaContent("/xml"), defaultContent,
+ "default /rmeta handler should no longer be xml");
+ }
+
+ private String rmetaContent(String handlerSuffix) throws Exception {
+ Response response = WebClient
+ .create(endPoint + META_PATH + handlerSuffix)
+ .accept("application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
+ Reader reader = new InputStreamReader((InputStream)
response.getEntity(), UTF_8);
+ return
JsonMetadataList.fromJson(reader).get(0).get(TikaCoreProperties.TIKA_CONTENT);
+ }
/*
@Test
public void testWriteLimitInAll() throws Exception {
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataFilterTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataFilterTest.java
index c8e9c76b6a..99e05f75ba 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataFilterTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataFilterTest.java
@@ -68,8 +68,9 @@ public class RecursiveMetadataFilterTest extends CXFTestBase {
@Test
public void testBasicFilter() throws Exception {
+ // request xml so the response exceeds the server's gzip threshold
(markdown is too short)
Response response = WebClient
- .create(endPoint + META_PATH)
+ .create(endPoint + META_PATH + "/xml")
.accept("application/json")
.acceptEncoding("gzip")
.put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
index d17ca0f889..936e202f1a 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
@@ -264,7 +264,8 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
.get(6)
.get(TikaCoreProperties.TIKA_CONTENT)
.trim();
- assertTrue(content.startsWith("<html
xmlns=\"http://www.w3.org/1999/xhtml\">"));
+ assertFalse(content.startsWith("<html"));
+ assertContains("plundered our seas", content);
//extra slash
response = WebClient
@@ -278,7 +279,8 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
.get(6)
.get(TikaCoreProperties.TIKA_CONTENT)
.trim();
- assertTrue(content.startsWith("<html
xmlns=\"http://www.w3.org/1999/xhtml\">"));
+ assertFalse(content.startsWith("<html"));
+ assertContains("plundered our seas", content);
//unparseable
response = WebClient
@@ -292,7 +294,8 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
.get(6)
.get(TikaCoreProperties.TIKA_CONTENT)
.trim();
- assertTrue(content.startsWith("<html
xmlns=\"http://www.w3.org/1999/xhtml\">"));
+ assertFalse(content.startsWith("<html"));
+ assertContains("plundered our seas", content);
//xml
response = WebClient
@@ -371,7 +374,8 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
.get(6)
.get(TikaCoreProperties.TIKA_CONTENT)
.trim();
- assertTrue(content.startsWith("<html
xmlns=\"http://www.w3.org/1999/xhtml\">"));
+ assertFalse(content.startsWith("<html"));
+ assertContains("plundered our seas", content);
//unparseable
attachmentPart =
@@ -389,7 +393,8 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
.get(6)
.get(TikaCoreProperties.TIKA_CONTENT)
.trim();
- assertTrue(content.startsWith("<html
xmlns=\"http://www.w3.org/1999/xhtml\">"));
+ assertFalse(content.startsWith("<html"));
+ assertContains("plundered our seas", content);
//xml
attachmentPart =