This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new a23433c4b4 TIKA-4663 - make markdown the default content handler in 
tika-app, tika-server, and the async CLI (#2877)
a23433c4b4 is described below

commit a23433c4b477d13cc45b996af08dd91c9b49578e
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 20:23:39 2026 -0400

    TIKA-4663 - make markdown the default content handler in tika-app, 
tika-server, and the async CLI (#2877)
    
    * TIKA-4663 - make markdown the default content handler in tika-app, 
tika-server, and the async CLI
---
 CHANGES.txt                                          | 12 ++++++++++++
 .../ROOT/pages/migration-to-4x/migrating-to-4x.adoc  | 10 ++++++++++
 docs/modules/ROOT/pages/pipes/getting-started.adoc   |  2 +-
 docs/modules/ROOT/pages/pipes/parse-modes.adoc       |  2 +-
 docs/modules/ROOT/pages/using-tika/cli/index.adoc    |  8 ++++----
 docs/modules/ROOT/pages/using-tika/server/index.adoc |  4 ++--
 .../main/java/org/apache/tika/cli/AsyncHelper.java   |  4 ++++
 .../src/main/java/org/apache/tika/cli/TikaCLI.java   | 12 +++++++-----
 .../java/org/apache/tika/cli/AsyncHelperTest.java    |  7 +++++++
 .../test/java/org/apache/tika/cli/TikaCLITest.java   | 12 ++++++++----
 .../apache/tika/sax/BasicContentHandlerFactory.java  |  2 +-
 tika-grpc/src/main/proto/tika.proto                  |  1 +
 .../java/org/apache/tika/async/cli/TikaAsyncCLI.java |  6 +++---
 .../apache/tika/async/cli/AsyncCliParserTest.java    |  3 +++
 .../core/resource/RecursiveMetadataResource.java     |  2 +-
 .../server/core/RecursiveMetadataResourceTest.java   | 20 ++++++++++++++++++++
 .../server/standard/RecursiveMetadataFilterTest.java |  3 ++-
 .../standard/RecursiveMetadataResourceTest.java      | 15 ++++++++++-----
 18 files changed, 97 insertions(+), 28 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 2e7a5898a5..31c3876a0e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,15 @@
+Release 4.0.0-beta-1 - unreleased
+
+  BREAKING CHANGES
+
+   * The default content handler is now Markdown. tika-app, tika-server
+     (the /tika and /rmeta endpoints), and the async/pipes CLI now emit
+     Markdown content by default instead of XHTML/XML (plain text for the
+     async CLI). Request the previous format explicitly, e.g. tika-app
+     -x/--xml, the server /tika/xml and /rmeta/xml paths (or the
+     X-Tika-Handler header), and the async CLI --handler x (TIKA-4663).
+
+
 Release 4.0.0-alpha-1 - 5/4/2026
 
   BREAKING CHANGES
diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc 
b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
index 1aed5c00fa..8ce3c69afb 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
@@ -40,6 +40,16 @@ java -jar tika-app-<version>.jar [option...] [file...]
 
 If you have build scripts or container images that drop in just the jar, 
update them to unpack the zip and run from inside it.
 
+== Default content handler: XHTML/XML -> Markdown
+
+In 3.x the default content handler produced XHTML/XML. In 4.x the default is 
**Markdown** everywhere:
+
+* `tika-app` outputs Markdown by default (was XHTML). Pass `-x`/`--xml`, 
`-h`/`--html`, or `-t`/`--text` to choose another format.
+* `tika-server` — the `/tika` and `/rmeta` endpoints return Markdown content 
by default (was XHTML/XML). Use an explicit handler path (`/tika/xml`, 
`/rmeta/xml`, ...) or the `X-Tika-Handler` header to choose another format.
+* The async/pipes CLI emits Markdown by default (was plain text). Use 
`--handler x` (etc.) to choose another format.
+
+If you parse the extracted content programmatically and expect XHTML/XML, 
request it explicitly as shown above (TIKA-4663).
+
 == Configuration: XML to JSON
 
 Tika 4.x uses JSON configuration files instead of XML. The legacy 
`tika-config.xml` format
diff --git a/docs/modules/ROOT/pages/pipes/getting-started.adoc 
b/docs/modules/ROOT/pages/pipes/getting-started.adoc
index db6955aeb7..4baa06c88b 100644
--- a/docs/modules/ROOT/pages/pipes/getting-started.adoc
+++ b/docs/modules/ROOT/pages/pipes/getting-started.adoc
@@ -53,7 +53,7 @@ java -jar tika-app.jar -i /data/input -o /data/output 
--handler t
 java -jar tika-app.jar -i /data/input -o /data/output -Z
 ----
 
-Handler types: `t` (text), `h` (html), `x` (xml), `m` (markdown), `b` (body), 
`i` (ignore/metadata only).
+Handler types: `t` (text), `h` (html), `x` (xml), `m` (markdown), `b` (body), 
`i` (ignore/metadata only). The default is `m` (markdown).
 
 == JSON Configuration
 
diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc 
b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
index ab81227ac9..1bcc401a24 100644
--- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
+++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
@@ -69,7 +69,7 @@ and applies to all modes that produce content (`RMETA`, 
`CONCATENATE`, `CONTENT_
 
 Accepted `type` values: `TEXT`, `HTML`, `XML`, `MARKDOWN`, `BODY`, `IGNORE`. 
The CLI
 `--handler` flag uses single-letter shortcuts (`t`, `h`, `x`, `m`, `b`, `i`) 
that map onto
-these values.
+these values. If unset, the default is `MARKDOWN`.
 
 [cols="1,1,2"]
 |===
diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc 
b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
index 55e7b5aa75..d7c1fe7d17 100644
--- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
@@ -118,7 +118,7 @@ options see <<_tika_pipes_processing,Tika Pipes 
Processing>> below.
 |Option |Description
 
 |`-x` or `--xml`
-|Output XHTML content (default)
+|Output XHTML content
 
 |`-h` or `--html`
 |Output HTML content
@@ -127,7 +127,7 @@ options see <<_tika_pipes_processing,Tika Pipes 
Processing>> below.
 |Output plain text content (body)
 
 |`--md`
-|Output Markdown content (body)
+|Output Markdown content (body) (default)
 
 |`-T` or `--text-main`
 |Output plain text — main content only, via the boilerpipe handler
@@ -145,7 +145,7 @@ options see <<_tika_pipes_processing,Tika Pipes 
Processing>> below.
 |Output metadata in XMP
 
 |`-J` or `--jsonRecursive`
-|Output metadata and content from all embedded files. Combine with 
`-x`/`-h`/`-t`/`-m` to choose the content type (default: `-x`).
+|Output metadata and content from all embedded files. Combine with 
`-x`/`-h`/`-t`/`-m` to choose the content type (default: `--md`).
 
 |`-r` or `--pretty-print`
 |For JSON, XML, and XHTML output, add newlines and whitespace for readability.
@@ -378,7 +378,7 @@ This processes all files in the input directory and writes 
JSON metadata
 |Option |Description
 
 |`--handler=<X>`
-|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, 
`i`=ignore. Default: `t`.
+|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, 
`i`=ignore. Default: `m`.
 
 |`--concatenate`
 |Concatenate content from all embedded documents into a single content field.
diff --git a/docs/modules/ROOT/pages/using-tika/server/index.adoc 
b/docs/modules/ROOT/pages/using-tika/server/index.adoc
index e630f97887..0c92d3e60f 100644
--- a/docs/modules/ROOT/pages/using-tika/server/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/server/index.adoc
@@ -117,7 +117,7 @@ For the root `/tika` PUT endpoint you can also pick the 
handler with a header:
 curl -T document.pdf -H "X-Tika-Handler: markdown" http://localhost:9998/tika
 ----
 
-Accepted values: `text`, `html`, `xml`, `markdown` (or `md`), `body`, `ignore`.
+Accepted values: `text`, `html`, `xml`, `markdown` (or `md`), `body`, 
`ignore`. The default is `markdown`.
 
 === Recursive Metadata (`/rmeta`)
 
@@ -126,7 +126,7 @@ array of metadata objects. The handler controls the content 
field of each entry:
 
 [source,bash]
 ----
-curl -T document.pdf http://localhost:9998/rmeta            # default: text
+curl -T document.pdf http://localhost:9998/rmeta            # default: markdown
 curl -T document.pdf http://localhost:9998/rmeta/text
 curl -T document.pdf http://localhost:9998/rmeta/html
 curl -T document.pdf http://localhost:9998/rmeta/xml
diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java 
b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
index df99107871..3314298958 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
@@ -78,6 +78,10 @@ public class AsyncHelper {
                 // Translate TikaCLI xml output to the TikaAsyncCLI handler 
type.
                 argList.add("--handler");
                 argList.add("x");
+            } else if (arg.equals("--md")) {
+                // Translate TikaCLI markdown output to the TikaAsyncCLI 
handler type.
+                argList.add("--handler");
+                argList.add("m");
             } else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
                 // TikaAsyncCLI always outputs JSON with recursive metadata 
(RMETA mode)
                 // This is already the default, so we just skip this arg
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index b420db0ac2..5bea9f0da6 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -243,7 +243,7 @@ public class TikaCLI {
             return getTransformerHandler(output, "xml", encoding, prettyPrint);
         }
     };
-    private OutputType type = XML;
+    private OutputType type = MARKDOWN;
     private final OutputType HTML = new OutputType() {
         @Override
         protected ContentHandler getContentHandler(OutputStream output, 
Metadata metadata) throws Exception {
@@ -793,6 +793,8 @@ public class TikaCLI {
             handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
         } else if (type.equals(TEXT_MAIN)) {
             handlerType = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
+        } else if (type.equals(MARKDOWN)) {
+            handlerType = BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN;
         } else if (type.equals(METADATA)) {
             handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
         }
@@ -823,10 +825,10 @@ public class TikaCLI {
         out.println("        writing the JSON to stdout. Redirect to save, 
e.g.:");
         out.println("        --convert-config-xml-to-json=tika-config.xml > 
tika-config.json");
         out.println("");
-        out.println("    -x  or --xml           Output XHTML content 
(default)");
+        out.println("    -x  or --xml           Output XHTML content");
         out.println("    -h  or --html          Output HTML content");
         out.println("    -t  or --text          Output plain text content 
(body)");
-        out.println("    --md                   Output Markdown content 
(body)");
+        out.println("    --md                   Output Markdown content (body) 
(default)");
         out.println("    -T  or --text-main     Output plain text content 
(main content only via boilerpipe handler)");
         out.println("    -A  or --text-all      Output all text content");
         out.println("    -m  or --metadata      Output only metadata");
@@ -834,7 +836,7 @@ public class TikaCLI {
         out.println("    -y  or --xmp           Output metadata in XMP");
         out.println("    -J  or --jsonRecursive Output metadata and content 
from all");
         out.println("                           embedded files (choose content 
type");
-        out.println("                           with -x, -h, -t or -m; default 
is -x)");
+        out.println("                           with -x, -h, -t or -m; default 
is --md)");
         out.println("    -a  or --async         Run Tika in async mode; must 
specify details in a" + " tikaConfig file");
         out.println("    -l  or --language      Output only language");
         out.println("    -d  or --detect        Detect document type");
@@ -915,7 +917,7 @@ public class TikaCLI {
         out.println("    -c, --config=<file>        Tika config file 
(--config=<file> also accepted)");
         out.println("    -p, --pluginsDir           Plugins directory");
         out.println("    --fileList                 File list (one path per 
line, relative to -i or absolute)");
-        out.println("    --handler                  Handler type: t=text, 
h=html, x=xml, m=markdown, b=body, i=ignore");
+        out.println("    --handler                  Handler type: t=text, 
h=html, x=xml, m=markdown, b=body, i=ignore (default: m)");
         out.println("    --concatenate              Concatenate content from 
all embedded documents");
         out.println("    --content-only             Output only extracted 
content (no JSON wrapper); implies --concatenate");
         out.println("    --on-exists                Behavior when an output 
file exists: exception (default), replace, skip");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java 
b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
index bb668b7660..e59434b519 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
@@ -57,6 +57,13 @@ public class AsyncHelperTest {
         assertArrayEquals(expected, AsyncHelper.translateArgs(args));
     }
 
+    @Test
+    public void testMarkdownHandler() throws Exception {
+        String[] args = new String[]{"--md", "input", "output"};
+        String[] expected = new String[]{"--handler", "m", "input", "output"};
+        assertArrayEquals(expected, AsyncHelper.translateArgs(args));
+    }
+
     @Test
     public void testExtractLongFormTranslatedToZ() throws Exception {
         // TIKA-4736: tika-app's --extract is the long form of -z. It must be
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java 
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 51889ac259..0cdb516f11 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -259,7 +259,7 @@ public class TikaCLITest {
 
     @Test
     public void testExtractJavascript() throws Exception {
-        String json = getParamOutContent("-J", resourcePrefix + 
"testPDFPackage.pdf");
+        String json = getParamOutContent("-J", "-x", resourcePrefix + 
"testPDFPackage.pdf");
         assertTrue(json.contains("type=\\\"PDActionJavaScript\\\""));
         assertTrue(json.contains("MACRO"));
         assertTrue(json.contains("NAMES_TREE"));
@@ -341,7 +341,7 @@ public class TikaCLITest {
      */
     @Test
     public void testListMetModels() throws Exception {
-        String content = getParamOutContent("--list-met-models", 
resourcePrefix + "alice.cli.test");
+        String content = getParamOutContent("--list-met-models", "-x", 
resourcePrefix + "alice.cli.test");
         assertTrue(content.contains("text/plain"));
     }
 
@@ -663,7 +663,7 @@ public class TikaCLITest {
 
     @Test
     public void testConfig() throws Exception {
-        String content = getParamOutContent("--config=" + 
CONFIGS_DIR.toString() + "/tika-config1.json", resourcePrefix + "bad_xml.xml");
+        String content = getParamOutContent("--config=" + 
CONFIGS_DIR.toString() + "/tika-config1.json", "-x", resourcePrefix + 
"bad_xml.xml");
         assertTrue(content.contains("apple"));
         
assertTrue(content.contains("org.apache.tika.parser.html.JSoupParser"));
     }
@@ -679,8 +679,12 @@ public class TikaCLITest {
 
     @Test
     public void testJsonRecursiveMetadataParserDefault() throws Exception {
+        // TIKA-4663: default handler is markdown, so recursive content is 
markdown, not XHTML.
         String content = getParamOutContent("-J", "-r", resourcePrefix + 
"test_recursive_embedded.docx");
-        assertTrue(content.contains("\"X-TIKA:content\" : \"<html 
xmlns=\\\"http://www.w3.org/1999/xhtml"));
+        assertFalse(content.contains("<html 
xmlns=\\\"http://www.w3.org/1999/xhtml"),
+                "default recursive content should be markdown, not XHTML");
+        assertTrue(content.contains("# embed1.zip"),
+                "default recursive content should be markdown (heading 
syntax)");
     }
 
     @Test
diff --git 
a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java 
b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 53d382b2e0..a7907fe3b7 100644
--- 
a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ 
b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -47,7 +47,7 @@ public class BasicContentHandlerFactory implements 
StreamingContentHandlerFactor
 
     /**
      * No-arg constructor for bean-style configuration (e.g., Jackson 
deserialization).
-     * Creates a factory with TEXT handler type, unlimited write, and 
throwOnWriteLimitReached=true.
+     * Creates a factory with MARKDOWN handler type, unlimited write, and 
throwOnWriteLimitReached=true.
      */
     public BasicContentHandlerFactory() {
     }
diff --git a/tika-grpc/src/main/proto/tika.proto 
b/tika-grpc/src/main/proto/tika.proto
index 0a64f37115..1979b2b162 100644
--- a/tika-grpc/src/main/proto/tika.proto
+++ b/tika-grpc/src/main/proto/tika.proto
@@ -101,6 +101,7 @@ message FetchAndParseRequest {
   // The ID of the emitter to use (optional). If not provided, no emitter will 
be used.
   string emitter_id = 4;
   // Optional JSON object to configure the ParseContext for this request, 
overriding server defaults.
+  // When unset, the content handler defaults to markdown.
   // Keys are parse-context component names; values are their JSON configs.
   // Example: {"basic-content-handler-factory": {"type": "HTML"}, 
"timeout-limits": {"progressTimeoutMillis": 30000}}
   // See the parse-context.idx component registry for available component 
names.
diff --git 
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
 
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index fb19447111..d9b96d359b 100644
--- 
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++ 
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -71,7 +71,7 @@ public class TikaAsyncCLI {
         options.addOption(null, "Xmx", true, "heap for the forked clients, 
e.g. --Xmx 1g");
         options.addOption("h", "help", false, "this help message");
         options.addOption("T", "timeoutMs", true, "timeout for each parse in 
milliseconds");
-        options.addOption(null, "handler", true, "handler type: t=text, 
h=html, x=xml, m=markdown, b=body, i=ignore");
+        options.addOption(null, "handler", true, "handler type: t=text, 
h=html, x=xml, m=markdown, b=body, i=ignore (default: m)");
         options.addOption("p", "pluginsDir", true, "plugins directory");
         options.addOption("l", "fileList", true,
                 "file containing one path per line (relative to inputDir or 
absolute)");
@@ -164,7 +164,7 @@ public class TikaAsyncCLI {
         if (args.length == 2 && ! args[0].startsWith("-")) {
             return new SimpleAsyncConfig(args[0], args[1], 1,
                     30000L, "-Xmx1g", null, null,
-                    BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                    BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN,
                     SimpleAsyncConfig.ExtractBytesMode.NONE, null);
         }
 
@@ -185,7 +185,7 @@ public class TikaAsyncCLI {
         String tikaConfig = null;
         String asyncConfig = null;
         String pluginsDir = null;
-        BasicContentHandlerFactory.HANDLER_TYPE handlerType = 
BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
+        BasicContentHandlerFactory.HANDLER_TYPE handlerType = 
BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN;
         SimpleAsyncConfig.ExtractBytesMode extractBytesMode = 
SimpleAsyncConfig.ExtractBytesMode.NONE;
         if (line.hasOption("i")) {
             inputDir = line.getOptionValue("i");
diff --git 
a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
 
b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
index b8960b7c08..975506989e 100644
--- 
a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
+++ 
b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncCliParserTest.java
@@ -46,6 +46,8 @@ public class AsyncCliParserTest {
         assertEquals(1, simpleAsyncConfig.getNumClients());
         assertEquals(30000L, simpleAsyncConfig.getTimeoutMs());
         assertEquals("-Xmx1g", simpleAsyncConfig.getXmx());
+        // TIKA-4663: default content handler is markdown
+        assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, 
simpleAsyncConfig.getHandlerType());
 
         simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(new String[]{"-o", 
"output", "-i", "input"});
         assertEquals("input", simpleAsyncConfig.getInputDir());
@@ -54,6 +56,7 @@ public class AsyncCliParserTest {
         assertNull(simpleAsyncConfig.getNumClients());
         assertNull(simpleAsyncConfig.getTimeoutMs());
         assertNull(simpleAsyncConfig.getXmx());
+        assertEquals(BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN, 
simpleAsyncConfig.getHandlerType());
 
         simpleAsyncConfig = TikaAsyncCLI.parseCommandLine(new 
String[]{"-output", "output", "-input", "input"});
         assertEquals("input", simpleAsyncConfig.getInputDir());
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index a129e47f30..2f33377762 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -51,7 +51,7 @@ import org.apache.tika.server.core.MetadataList;
 public class RecursiveMetadataResource {
 
     protected static final String HANDLER_TYPE_PARAM = "handler";
-    protected static final BasicContentHandlerFactory.HANDLER_TYPE 
DEFAULT_HANDLER_TYPE = BasicContentHandlerFactory.HANDLER_TYPE.XML;
+    protected static final BasicContentHandlerFactory.HANDLER_TYPE 
DEFAULT_HANDLER_TYPE = BasicContentHandlerFactory.HANDLER_TYPE.MARKDOWN;
     private static final Logger LOG = 
LoggerFactory.getLogger(RecursiveMetadataResource.class);
 
     /**
diff --git 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
index 106db976d9..930225db26 100644
--- 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
+++ 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.server.core;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
 
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -71,6 +72,25 @@ public class RecursiveMetadataResourceTest extends 
CXFTestBase {
         assertContains("null pointer message", 
metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION));
 
     }
+
+    @Test
+    public void testDefaultHandlerIsMarkdown() throws Exception {
+        // TIKA-4663: /rmeta with no handler now defaults to markdown (was 
xml).
+        String defaultContent = rmetaContent("");
+        assertEquals(rmetaContent("/markdown"), defaultContent,
+                "default /rmeta handler should be markdown");
+        assertNotEquals(rmetaContent("/xml"), defaultContent,
+                "default /rmeta handler should no longer be xml");
+    }
+
+    private String rmetaContent(String handlerSuffix) throws Exception {
+        Response response = WebClient
+                .create(endPoint + META_PATH + handlerSuffix)
+                .accept("application/json")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
+        Reader reader = new InputStreamReader((InputStream) 
response.getEntity(), UTF_8);
+        return 
JsonMetadataList.fromJson(reader).get(0).get(TikaCoreProperties.TIKA_CONTENT);
+    }
     /*
     @Test
     public void testWriteLimitInAll() throws Exception {
diff --git 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataFilterTest.java
 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataFilterTest.java
index c8e9c76b6a..99e05f75ba 100644
--- 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataFilterTest.java
+++ 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataFilterTest.java
@@ -68,8 +68,9 @@ public class RecursiveMetadataFilterTest extends CXFTestBase {
 
     @Test
     public void testBasicFilter() throws Exception {
+        // request xml so the response exceeds the server's gzip threshold 
(markdown is too short)
         Response response = WebClient
-                .create(endPoint + META_PATH)
+                .create(endPoint + META_PATH + "/xml")
                 .accept("application/json")
                 .acceptEncoding("gzip")
                 
.put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
diff --git 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
index d17ca0f889..936e202f1a 100644
--- 
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
+++ 
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
@@ -264,7 +264,8 @@ public class RecursiveMetadataResourceTest extends 
CXFTestBase {
                 .get(6)
                 .get(TikaCoreProperties.TIKA_CONTENT)
                 .trim();
-        assertTrue(content.startsWith("<html 
xmlns=\"http://www.w3.org/1999/xhtml\">"));
+        assertFalse(content.startsWith("<html"));
+        assertContains("plundered our seas", content);
 
         //extra slash
         response = WebClient
@@ -278,7 +279,8 @@ public class RecursiveMetadataResourceTest extends 
CXFTestBase {
                 .get(6)
                 .get(TikaCoreProperties.TIKA_CONTENT)
                 .trim();
-        assertTrue(content.startsWith("<html 
xmlns=\"http://www.w3.org/1999/xhtml\">"));
+        assertFalse(content.startsWith("<html"));
+        assertContains("plundered our seas", content);
 
         //unparseable
         response = WebClient
@@ -292,7 +294,8 @@ public class RecursiveMetadataResourceTest extends 
CXFTestBase {
                 .get(6)
                 .get(TikaCoreProperties.TIKA_CONTENT)
                 .trim();
-        assertTrue(content.startsWith("<html 
xmlns=\"http://www.w3.org/1999/xhtml\">"));
+        assertFalse(content.startsWith("<html"));
+        assertContains("plundered our seas", content);
 
         //xml
         response = WebClient
@@ -371,7 +374,8 @@ public class RecursiveMetadataResourceTest extends 
CXFTestBase {
                 .get(6)
                 .get(TikaCoreProperties.TIKA_CONTENT)
                 .trim();
-        assertTrue(content.startsWith("<html 
xmlns=\"http://www.w3.org/1999/xhtml\">"));
+        assertFalse(content.startsWith("<html"));
+        assertContains("plundered our seas", content);
 
         //unparseable
         attachmentPart =
@@ -389,7 +393,8 @@ public class RecursiveMetadataResourceTest extends 
CXFTestBase {
                 .get(6)
                 .get(TikaCoreProperties.TIKA_CONTENT)
                 .trim();
-        assertTrue(content.startsWith("<html 
xmlns=\"http://www.w3.org/1999/xhtml\">"));
+        assertFalse(content.startsWith("<html"));
+        assertContains("plundered our seas", content);
 
         //xml
         attachmentPart =

Reply via email to