This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4734
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 0b0dc4f75c5adbc3bf014c8796e609512572b917
Author: tallison <[email protected]>
AuthorDate: Thu May 21 06:34:08 2026 -0400

    TIKA-4734 -- fix xml config converter
---
 .../pages/migration-to-4x/migrating-to-4x.adoc     |  5 ++-
 .../src/main/java/org/apache/tika/cli/TikaCLI.java | 44 ++++++++++++++--------
 .../test/java/org/apache/tika/cli/TikaCLITest.java | 19 ++++++++++
 3 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
index 34ef91d778..7cca66ec39 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
@@ -36,9 +36,12 @@ Tika provides a conversion tool in `tika-app` to help migrate your XML configura
 
 [source,bash]
 ----
-java -jar tika-app.jar --convert-config-xml-to-json=tika-config.xml,tika-config.json
+java -jar tika-app.jar --convert-config-xml-to-json=tika-config.xml > tika-config.json
 ----
 
+The converted JSON is written to standard output, so redirect it to the file of your choice
+(as shown above). No separate `--config` argument is needed.
+
 The converter currently supports:
 
 * **Parsers section** - parser declarations with parameters and exclusions
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 82be748314..80999eee21 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -56,6 +56,7 @@ import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.io.output.CloseShieldOutputStream;
 import org.apache.logging.log4j.Level;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -364,6 +365,15 @@ public class TikaCLI {
 
     private boolean testForAsync(String[] args) {
 
+        // Standalone utility flags are handled directly by process(), never via async mode.
+        // (Without this guard, --convert-config-xml-to-json=in.xml would be misread as a
+        // ".json"/batch arg and routed to async, failing with a TikaConfigException - TIKA-4734.)
+        for (String arg : args) {
+            if (arg.startsWith("--convert-config-xml-to-json=")) {
+                return false;
+            }
+        }
+
         // Single .json file is a config file for async mode
         if (args.length == 1 && args[0].endsWith(".json")) {
             return true;
@@ -576,27 +586,27 @@ public class TikaCLI {
         System.out.println(localConfig.getConfig().toString());
     }*/
 
-    private void convertConfigXmlToJson(String paths) throws Exception {
-        String[] parts = paths.split(",");
-        if (parts.length != 2) {
-            System.err.println("Error: --convert-config-xml-to-json requires input and output paths separated by comma");
-            System.err.println("Usage: --convert-config-xml-to-json=<input.xml>,<output.json>");
+    private void convertConfigXmlToJson(String inputPath) throws Exception {
+        if (inputPath == null || inputPath.trim().isEmpty()) {
+            System.err.println("Error: --convert-config-xml-to-json requires an input XML path");
+            System.err.println("Usage: --convert-config-xml-to-json=<input.xml> > <output.json>");
             return;
         }
 
-        Path xmlPath = Paths.get(parts[0].trim());
-        Path jsonPath = Paths.get(parts[1].trim());
+        Path xmlPath = Paths.get(inputPath.trim());
 
         if (!Files.exists(xmlPath)) {
             System.err.println("Error: Input XML file not found: " + xmlPath);
             return;
         }
 
-        try {
-            XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
-            System.out.println("Successfully converted XML config to JSON:");
-            System.out.println("  Input:  " + xmlPath.toAbsolutePath());
-            System.out.println("  Output: " + jsonPath.toAbsolutePath());
+        // Write JSON to stdout so the user can redirect it (e.g. > tika-config.json).
+        // Informational/diagnostic output from the converter goes to the logger (stderr),
+        // keeping stdout clean for the JSON payload. The converter closes the stream it
+        // is handed, so shield System.out from being closed out from under us.
+        try (InputStream in = Files.newInputStream(xmlPath)) {
+            XmlToJsonConfigConverter.convert(in, CloseShieldOutputStream.wrap(System.out));
+            System.out.flush();
         } catch (Exception e) {
             System.err.println("Error converting config: " + e.getMessage());
             throw e;
@@ -743,16 +753,18 @@ public class TikaCLI {
         out.println();
         out.println("    -g  or --gui           Start the Apache Tika GUI");
         out.println();
-        out.println("    --config=<tika-config.xml>");
-        out.println("        TikaConfig file. Must be specified before -g, -s, -f or the dump-x-config !");
+        out.println("    --config=<tika-config.json>");
+        out.println("        TikaConfig file (JSON as of Tika 4.x). Must be specified before -g, -s or -f !");
         // TODO: TIKA-XXXX - Re-enable config dump options once JSON serialization is complete
         // These options are not yet implemented in 4.x due to the migration from XML to JSON config
         // out.println("    --dump-minimal-config  Print minimal TikaConfig");
         // out.println("    --dump-current-config  Print current TikaConfig");
         // out.println("    --dump-static-config   Print static config");
         // out.println("    --dump-static-full-config  Print static explicit config");
-        out.println("    --convert-config-xml-to-json=<input.xml>,<output.json>");
-        out.println("        Convert legacy XML config to JSON format (parsers section only)");
+        out.println("    --convert-config-xml-to-json=<input.xml>");
+        out.println("        Convert a legacy 3.x XML config to 4.x JSON format (parsers section only),");
+        out.println("        writing the JSON to stdout. Redirect to save, e.g.:");
+        out.println("        --convert-config-xml-to-json=tika-config.xml > tika-config.json");
         out.println("");
         out.println("    -x  or --xml           Output XHTML content (default)");
         out.println("    -h  or --html          Output HTML content");
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 5498f3f056..396e174057 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -764,6 +764,25 @@ public class TikaCLITest {
      * reset outContent and errContent if they are not empty
      * run given params in TikaCLI and return outContent String with UTF-8
      */
+    /**
+     * Tests --convert-config-xml-to-json with no separate config file.
+     * Regression test for TIKA-4734: the flag used to be misrouted to async
+     * mode (the input arg ended in ".json"), failing with a TikaConfigException
+     * unless a --config was also passed. It must now run standalone and write
+     * the converted JSON to stdout.
+     */
+    @Test
+    public void testConvertConfigXmlToJson() throws Exception {
+        String xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI()).toString();
+        String content = getParamOutContent("--convert-config-xml-to-json=" + xmlPath);
+
+        // stdout should contain the converted JSON (and only the JSON)
+        assertTrue(content.contains("\"parsers\""), "Expected JSON parsers section, got: " + content);
+        assertTrue(content.contains("pdf-parser"), "Expected pdf-parser in output, got: " + content);
+        assertTrue(content.contains("\"sortByPosition\" : true"), "Expected converted param, got: " + content);
+        assertTrue(content.trim().startsWith("{"), "Output should be pure JSON, got: " + content);
+    }
+
     String getParamOutContent(String... params) throws Exception {
         resetContent();
         TikaCLI.main(params);

Reply via email to