This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 585b1d00a00 SOLR-17867: Export Tool should export regular docs cleanly 
in .json, .jsonl, and javabin (second take) (#2636)
585b1d00a00 is described below

commit 585b1d00a0053b9fadcaecd82082667a5a255169
Author: Eric Pugh <ep...@opensourceconnections.com>
AuthorDate: Sat Aug 16 08:09:04 2025 -0400

    SOLR-17867: Export Tool should export regular docs cleanly in .json, 
.jsonl, and javabin (second take) (#2636)
    
    * Properly handle the format parameter and the compress parameter
    * Added new integration (bats) tests.
    * Refactored sink code to reduce duplication.
    
    Does NOT deal with nested documents properly.  That issue remains.
---
 .../ExportTool.java => ExportTool.java.original    |   0
 solr/CHANGES.txt                                   |   2 +
 .../src/java/org/apache/solr/cli/ExportTool.java   | 135 +++++++++------------
 .../test/org/apache/solr/cli/TestExportTool.java   |  75 ++++++++++--
 solr/packaging/test/test_export.bats               |   5 +-
 .../pages/solr-control-script-reference.adoc       |  28 +++--
 6 files changed, 145 insertions(+), 100 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cli/ExportTool.java 
b/ExportTool.java.original
similarity index 100%
copy from solr/core/src/java/org/apache/solr/cli/ExportTool.java
copy to ExportTool.java.original
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3baf170ec67..73dd813599c 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -51,6 +51,8 @@ Improvements
 
 * SOLR-17852: Migrate Schema Designer to use FileStore API instead of 
BlobHandler for persisting working data. (Eric Pugh)
 
+* SOLR-17867: Export tool should properly output exported documents in json, 
json w/ lines, and javabin formats. (Eric Pugh)
+
 Optimizations
 ---------------------
 * SOLR-17568: The CLI bin/solr export tool now contacts the appropriate nodes 
directly for data instead of proxying through one.
diff --git a/solr/core/src/java/org/apache/solr/cli/ExportTool.java 
b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
index 050092c2017..c742c00999c 100644
--- a/solr/core/src/java/org/apache/solr/cli/ExportTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
@@ -219,12 +219,20 @@ public class ExportTool extends ToolBase {
       } else if (Files.isDirectory(Path.of(this.out))) {
         this.out = this.out + "/" + coll;
       }
-      this.out = this.out + '.' + this.format;
-      if (compress) {
+      if (!hasExtension(this.out)) {
+        this.out = this.out + '.' + this.format;
+      }
+      if (compress & !this.out.endsWith(".gz")) {
         this.out = this.out + ".gz";
       }
     }
 
+    public static boolean hasExtension(String filename) {
+      return filename.contains(".json")
+          || filename.contains(".jsonl")
+          || filename.contains(".javabin");
+    }
+
     DocsSink getSink() {
       DocsSink docSink = null;
       switch (format) {
@@ -311,6 +319,51 @@ public class ExportTool extends ToolBase {
     Info info;
     OutputStream fos;
 
+    /** Process a SolrDocument into a Map, handling special fields and date 
conversion. */
+    protected Map<String, Object> processDocument(SolrDocument doc) {
+      Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
+      doc.forEach(
+          (s, field) -> {
+            if (s.equals("_version_") || s.equals("_roor_")) return;
+            if (field instanceof List) {
+              if (((List<?>) field).size() == 1) {
+                field = ((List<?>) field).get(0);
+              }
+            }
+            field = constructDateStr(field);
+            if (field instanceof List<?> list) {
+              if (hasDate(list)) {
+                ArrayList<Object> listCopy = new ArrayList<>(list.size());
+                for (Object o : list) listCopy.add(constructDateStr(o));
+                field = listCopy;
+              }
+            }
+            m.put(s, field);
+          });
+      return m;
+    }
+
+    /** Check if a list contains any Date objects */
+    protected boolean hasDate(List<?> list) {
+      boolean hasDate = false;
+      for (Object o : list) {
+        if (o instanceof Date) {
+          hasDate = true;
+          break;
+        }
+      }
+      return hasDate;
+    }
+
+    /** Convert Date objects to ISO formatted strings */
+    protected Object constructDateStr(Object field) {
+      if (field instanceof Date) {
+        field =
+            DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) 
field).getTime()));
+      }
+      return field;
+    }
+
     abstract void start() throws IOException;
 
     @SuppressForbidden(reason = "Command line tool prints out to console")
@@ -356,49 +409,12 @@ public class ExportTool extends ToolBase {
     @Override
     public synchronized void accept(SolrDocument doc) throws IOException {
       charArr.reset();
-      Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
-      doc.forEach(
-          (s, field) -> {
-            if (s.equals("_version_") || s.equals("_roor_")) return;
-            if (field instanceof List) {
-              if (((List<?>) field).size() == 1) {
-                field = ((List<?>) field).get(0);
-              }
-            }
-            field = constructDateStr(field);
-            if (field instanceof List<?> list) {
-              if (hasdate(list)) {
-                ArrayList<Object> listCopy = new ArrayList<>(list.size());
-                for (Object o : list) listCopy.add(constructDateStr(o));
-                field = listCopy;
-              }
-            }
-            m.put(s, field);
-          });
+      Map<String, Object> m = processDocument(doc);
       jsonWriter.write(m);
       writer.write(charArr.getArray(), charArr.getStart(), charArr.getEnd());
       writer.append('\n');
       super.accept(doc);
     }
-
-    private boolean hasdate(List<?> list) {
-      boolean hasDate = false;
-      for (Object o : list) {
-        if (o instanceof Date) {
-          hasDate = true;
-          break;
-        }
-      }
-      return hasDate;
-    }
-
-    private Object constructDateStr(Object field) {
-      if (field instanceof Date) {
-        field =
-            DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) 
field).getTime()));
-      }
-      return field;
-    }
   }
 
   static class JsonSink extends DocsSink {
@@ -435,25 +451,7 @@ public class ExportTool extends ToolBase {
     @Override
     public synchronized void accept(SolrDocument doc) throws IOException {
       charArr.reset();
-      Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
-      doc.forEach(
-          (s, field) -> {
-            if (s.equals("_version_") || s.equals("_roor_")) return;
-            if (field instanceof List) {
-              if (((List<?>) field).size() == 1) {
-                field = ((List<?>) field).get(0);
-              }
-            }
-            field = constructDateStr(field);
-            if (field instanceof List<?> list) {
-              if (hasdate(list)) {
-                ArrayList<Object> listCopy = new ArrayList<>(list.size());
-                for (Object o : list) listCopy.add(constructDateStr(o));
-                field = listCopy;
-              }
-            }
-            m.put(s, field);
-          });
+      Map<String, Object> m = processDocument(doc);
       if (firstDoc) {
         firstDoc = false;
       } else {
@@ -464,25 +462,6 @@ public class ExportTool extends ToolBase {
       writer.append('\n');
       super.accept(doc);
     }
-
-    private boolean hasdate(List<?> list) {
-      boolean hasDate = false;
-      for (Object o : list) {
-        if (o instanceof Date) {
-          hasDate = true;
-          break;
-        }
-      }
-      return hasDate;
-    }
-
-    private Object constructDateStr(Object field) {
-      if (field instanceof Date) {
-        field =
-            DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) 
field).getTime()));
-      }
-      return field;
-    }
   }
 
   static class JavabinSink extends DocsSink {
diff --git a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java 
b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
index f74eedbb0d8..d356c439849 100644
--- a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
+++ b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
@@ -19,6 +19,7 @@ package org.apache.solr.cli;
 
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
@@ -28,6 +29,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.function.Predicate;
+import java.util.zip.GZIPInputStream;
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.client.solrj.SolrClient;
@@ -50,6 +52,31 @@ import org.junit.Test;
 @SolrTestCaseJ4.SuppressSSL
 public class TestExportTool extends SolrCloudTestCase {
 
+  public void testOutputFormatToFileNameMapping() {
+
+    ToolRuntime runtime = new CLITestHelper.TestingRuntime(false);
+    String url = "http://example:8983/solr/mycollection";;
+    ExportTool.Info info = new ExportTool.MultiThreadedRunner(runtime, url, 
null);
+
+    info.setOutFormat(null, "json", false);
+    assertEquals("mycollection.json", info.out);
+
+    info.setOutFormat(null, "jsonl", false);
+    assertEquals("mycollection.jsonl", info.out);
+
+    info.setOutFormat(null, "javabin", false);
+    assertEquals("mycollection.javabin", info.out);
+
+    String tempFile = createTempDir() + "/myoutput.json";
+    info.setOutFormat(tempFile, "json", false);
+    assertEquals(tempFile, info.out);
+
+    // test with compression
+    tempFile = createTempDir() + "/myoutput.myoutput.json.gz";
+    info.setOutFormat(tempFile, "json", true);
+    assertEquals(tempFile, info.out);
+  }
+
   @Test
   public void testBasic() throws Exception {
     String COLLECTION_NAME = "globalLoaderColl";
@@ -92,7 +119,8 @@ public class TestExportTool extends SolrCloudTestCase {
       info.fields = "id,desc_s,a_dt";
       info.exportDocs();
 
-      assertJsonDocsCount(info, 200, record -> 
"2019-09-30T05:58:03Z".equals(record.get("a_dt")));
+      assertJsonLinesDocsCount(
+          info, 200, record -> 
"2019-09-30T05:58:03Z".equals(record.get("a_dt")));
 
       info = new ExportTool.MultiThreadedRunner(runtime, url, null);
       absolutePath =
@@ -102,7 +130,7 @@ public class TestExportTool extends SolrCloudTestCase {
       info.fields = "id,desc_s";
       info.exportDocs();
 
-      assertJsonDocsCount(info, 1000, null);
+      assertJsonLinesDocsCount(info, 1000, null);
 
       info = new ExportTool.MultiThreadedRunner(runtime, url, null);
       absolutePath =
@@ -131,7 +159,7 @@ public class TestExportTool extends SolrCloudTestCase {
       info.fields = "id,desc_s";
       info.exportDocs();
 
-      assertJsonDocsCount2(info, 200);
+      assertJsonDocsCount(info, 200);
 
       info = new ExportTool.MultiThreadedRunner(runtime, url, null);
       absolutePath =
@@ -141,7 +169,7 @@ public class TestExportTool extends SolrCloudTestCase {
       info.fields = "id,desc_s";
       info.exportDocs();
 
-      assertJsonDocsCount2(info, 1000);
+      assertJsonDocsCount(info, 1000);
 
     } finally {
       cluster.shutdown();
@@ -197,11 +225,9 @@ public class TestExportTool extends SolrCloudTestCase {
       assertEquals(docCount, totalDocsFromCores);
 
       ToolRuntime runtime = new CLITestHelper.TestingRuntime(false);
-      ExportTool.MultiThreadedRunner info;
-      String absolutePath;
 
-      info = new ExportTool.MultiThreadedRunner(runtime, url, null);
-      absolutePath =
+      ExportTool.MultiThreadedRunner info = new 
ExportTool.MultiThreadedRunner(runtime, url, null);
+      String absolutePath =
           baseDir.resolve(COLLECTION_NAME + random().nextInt(100000) + 
".javabin").toString();
       info.setOutFormat(absolutePath, "javabin", false);
       info.setLimit("-1");
@@ -211,6 +237,7 @@ public class TestExportTool extends SolrCloudTestCase {
         assertEquals(
             e.getValue().longValue(), 
info.corehandlers.get(e.getKey()).receivedDocs.get());
       }
+
       info = new ExportTool.MultiThreadedRunner(runtime, url, null);
       absolutePath =
           baseDir.resolve(COLLECTION_NAME + random().nextInt(100000) + 
".jsonl").toString();
@@ -280,7 +307,7 @@ public class TestExportTool extends SolrCloudTestCase {
     }
   }
 
-  private void assertJsonDocsCount2(ExportTool.Info info, int expected) {
+  private void assertJsonDocsCount(ExportTool.Info info, int expected) {
     assertTrue(
         "" + info.docsWritten.get() + " expected " + expected, 
info.docsWritten.get() >= expected);
   }
@@ -310,4 +337,34 @@ public class TestExportTool extends SolrCloudTestCase {
       rdr.close();
     }
   }
+
+  private void assertJsonLinesDocsCount(
+      ExportTool.Info info, int expected, Predicate<Map<String, Object>> 
predicate)
+      throws IOException {
+    assertTrue(
+        "" + info.docsWritten.get() + " expected " + expected, 
info.docsWritten.get() >= expected);
+
+    JsonRecordReader jsonReader;
+    Reader rdr;
+    jsonReader = JsonRecordReader.getInst("/", List.of("$FQN:/**"));
+    InputStream is = new FileInputStream(info.out);
+    if (info.compress) {
+      is = new GZIPInputStream(is);
+    }
+    rdr = new InputStreamReader(is, StandardCharsets.UTF_8);
+    try {
+      int[] count = new int[] {0};
+      jsonReader.streamRecords(
+          rdr,
+          (record, path) -> {
+            if (predicate != null) {
+              assertTrue(predicate.test(record));
+            }
+            count[0]++;
+          });
+      assertTrue(count[0] >= expected);
+    } finally {
+      rdr.close();
+    }
+  }
 }
diff --git a/solr/packaging/test/test_export.bats 
b/solr/packaging/test/test_export.bats
index cb089ecd5fe..f497fde4839 100644
--- a/solr/packaging/test/test_export.bats
+++ b/solr/packaging/test/test_export.bats
@@ -45,9 +45,8 @@ teardown() {
   assert [ -e techproducts.javabin ]
   rm techproducts.javabin
 
-  # old pattern of putting a suffix on the output that controlled the format 
no longer supported ;-).
-  run solr export --solr-url http://localhost:${SOLR_PORT} -c techproducts 
--query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/output.javabin"
-  assert [ -e ${BATS_TEST_TMPDIR}/output.javabin.json ]
+  run solr export --solr-url http://localhost:${SOLR_PORT} -c techproducts 
--query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/output.json"
+  assert [ -e ${BATS_TEST_TMPDIR}/output.json ]
 
   run solr export --solr-url http://localhost:${SOLR_PORT} -c techproducts 
--query "*:* -id:test" --output "${BATS_TEST_TMPDIR}"
   assert [ -e ${BATS_TEST_TMPDIR}/techproducts.json ]
diff --git 
a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
 
b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
index 48fe8e18347..842d294b06e 100644
--- 
a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
+++ 
b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
@@ -1619,12 +1619,12 @@ Examples of this command:
 
 === Exporting Documents to a File
 
-The `export` command will allow you to export documents from a collection in 
JSON, JSON with Lines, or Javabin format.
-All documents can be exported, or only those that match a query.
+The `export` command will allow you to export documents from a collection in 
JSON, https://jsonlines.org/[JSON Lines], or Javabin format.
+All documents can be exported, or only those that match a query.  You may need 
to wrap some parameters with quotes.
 
 NOTE: This hasn't been tested with nested child documents and your results 
will vary.
 
-NOTE: The `export` command only works with in a Solr running in cloud mode.
+NOTE: The `export` command only works with Solr running in cloud mode.
 
 `bin/solr export [options]`
 
@@ -1661,9 +1661,11 @@ Name of the collection to run an export against.
 |Optional |Default: `json`
 |===
 +
-The file format of the export, `json`, `jsonl`, or `javabin`.
-Choosing `javabin` exports in the native Solr format, and is compact and fast 
to import.
-`jsonl` is the Json with Lines format, learn more at https://jsonlines.org/.
+The file format of the export: `json` (default), `jsonl`, or `javabin`; this 
also specifies the file extension to be used.
+`json` and `jsonl` both export documents in the same format as using 
`wt=json`.  The `json` output file is suitable for
+immediately posting back to Solr via the `/update/json` endpoint.  `jsonl` 
outputs each Solr document on its own line,
+and is useful for parallel processing tasks. Learn more at 
https://jsonlines.org/.
+Choosing `javabin` exports in the native binary Solr format and is compact and 
faster to import.
 
 `--output <path>`::
 +
@@ -1673,6 +1675,7 @@ Choosing `javabin` exports in the native Solr format, and 
is compact and fast to
 |===
 +
 Either the path to the directory for the exported data to be written to, or a 
specific file to be written out.
+If the file name ends with `.gz` the output will be compressed into a .gz file.
 +
 If only a directory is specified then the file will be created with the name 
of the collection, as in `<collection>.json`.
 
@@ -1692,7 +1695,7 @@ If you specify `--compress` then the resulting outputting 
file with will be gzip
 |Optional |Default: `\*:*`
 |===
 +
-A custom query.
+A custom query to select documents for exporting.
 The default is `\*:*` which will export all documents.
 
 `--fields <fields>`::
@@ -1727,7 +1730,7 @@ This parameter is unnecessary if `SOLR_AUTH_TYPE` is 
defined in `solr.in.sh` or
 
 *Examples*
 
-Export all documents from a collection `gettingstarted`:
+Export all documents from a collection `gettingstarted` into a file called 
`gettingstarted.json`:
 
 [source,bash]
 bin/solr export --solr-url http://localhost:8983 -c gettingstarted --limit -1
@@ -1741,7 +1744,12 @@ bin/solr export --solr-url http://localhost:8983 -c 
gettingstarted --limit -1 --
 
 === Importing Documents into a Collection
 
-Once you have exported documents in a file, you can use the 
xref:indexing-guide:indexing-with-update-handlers.adoc[/update request handler] 
to import them to a new Solr collection.
+Once you have exported documents in a file, you can use the 
xref:indexing-guide:indexing-with-update-handlers.adoc#json-formatted-index-updates[/update
 request handler] to import them to a new Solr collection.
+Notice the different endpoints used depending on the format.  
+
+*Example: import `json` files via the `/update/json` endpoint*
+
+`curl -X POST --header "Content-Type: application/json" -d 
@gettingstarted.json 
http://localhost:8983/solr/gettingstarted/update/json?commit=true`
 
 *Example: import `json` files*
 
@@ -1763,7 +1771,7 @@ Now import the data with either of these methods:
 
 [,console]
 ----
-$ curl -X POST -d @gettingstarted.json 
'http://localhost:8983/solr/test_collection/update/json/docs?commit=true'
+$ curl -X POST --header "Content-Type: application/json" -d 
@gettingstarted.json 
'http://localhost:8983/solr/test_collection/update/json/docs?commit=true'
 ----
 or
 [,console]

Reply via email to