This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
     new 585b1d00a00 SOLR-17867: Export Tool should export regular docs cleanly in .json, .jsonl, and javabin (second take) (#2636)
585b1d00a00 is described below

commit 585b1d00a0053b9fadcaecd82082667a5a255169
Author: Eric Pugh <ep...@opensourceconnections.com>
AuthorDate: Sat Aug 16 08:09:04 2025 -0400

    SOLR-17867: Export Tool should export regular docs cleanly in .json, .jsonl, and javabin (second take) (#2636)

    * Properly handle the format parameter and the compress parameter
    * Added new integration (bats) tests.
    * Refactored sink code to reduce duplication.

    Does NOT deal with nested documents properly. That issue remains.
---
 .../ExportTool.java => ExportTool.java.original    |   0
 solr/CHANGES.txt                                   |   2 +
 .../src/java/org/apache/solr/cli/ExportTool.java   | 135 +++++++++------------
 .../test/org/apache/solr/cli/TestExportTool.java   |  75 ++++++++++--
 solr/packaging/test/test_export.bats               |   5 +-
 .../pages/solr-control-script-reference.adoc       |  28 +++--
 6 files changed, 145 insertions(+), 100 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cli/ExportTool.java b/ExportTool.java.original
similarity index 100%
copy from solr/core/src/java/org/apache/solr/cli/ExportTool.java
copy to ExportTool.java.original

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3baf170ec67..73dd813599c 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -51,6 +51,8 @@ Improvements
 * SOLR-17852: Migrate Schema Designer to use FileStore API instead of BlobHandler for persisting
   working data. (Eric Pugh)

+* SOLR-17867: Export tool should properly output exported documents in json, json w/ lines, and javabin formats. (Eric Pugh)
+
 Optimizations
 ---------------------
 * SOLR-17568: The CLI bin/solr export tool now contacts the appropriate nodes directly for data instead of proxying through one.
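The ExportTool.java diff below centers on how the output file name is derived from --output, the collection name, the format, and --compress. As a rough standalone sketch of those naming rules (class and method names here are illustrative, not the tool's API, and the directory-handling branch is omitted):

/** Hypothetical sketch of the naming rules in ExportTool.Info.setOutFormat. */
public class OutputNameSketch {

  static boolean hasKnownExtension(String filename) {
    // Mirrors hasExtension(...) in the diff: a known extension suppresses appending another.
    return filename.contains(".json")
        || filename.contains(".jsonl")
        || filename.contains(".javabin");
  }

  static String resolve(String out, String collection, String format, boolean compress) {
    if (out == null) {
      out = collection; // no --output given: fall back to the collection name
    }
    if (!hasKnownExtension(out)) {
      out = out + '.' + format; // append the format-derived extension exactly once
    }
    if (compress && !out.endsWith(".gz")) {
      out = out + ".gz"; // --compress gzips the file unless the name already says so
    }
    return out;
  }

  public static void main(String[] args) {
    System.out.println(resolve(null, "techproducts", "jsonl", false)); // techproducts.jsonl
    System.out.println(resolve("/tmp/out.json", "techproducts", "json", true)); // /tmp/out.json.gz
  }
}

This matches what the new unit and bats tests assert: an explicit --output ending in a known extension is used as-is, and --compress only appends .gz when the name does not already end with it.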
diff --git a/solr/core/src/java/org/apache/solr/cli/ExportTool.java b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
index 050092c2017..c742c00999c 100644
--- a/solr/core/src/java/org/apache/solr/cli/ExportTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
@@ -219,12 +219,20 @@ public class ExportTool extends ToolBase {
     } else if (Files.isDirectory(Path.of(this.out))) {
       this.out = this.out + "/" + coll;
     }
-    this.out = this.out + '.' + this.format;
-    if (compress) {
+    if (!hasExtension(this.out)) {
+      this.out = this.out + '.' + this.format;
+    }
+    if (compress && !this.out.endsWith(".gz")) {
       this.out = this.out + ".gz";
     }
   }

+  public static boolean hasExtension(String filename) {
+    return filename.contains(".json")
+        || filename.contains(".jsonl")
+        || filename.contains(".javabin");
+  }
+
   DocsSink getSink() {
     DocsSink docSink = null;
     switch (format) {
@@ -311,6 +319,51 @@ public class ExportTool extends ToolBase {
     Info info;
     OutputStream fos;

+    /** Process a SolrDocument into a Map, handling special fields and date conversion. */
+    protected Map<String, Object> processDocument(SolrDocument doc) {
+      Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
+      doc.forEach(
+          (s, field) -> {
+            if (s.equals("_version_") || s.equals("_root_")) return;
+            if (field instanceof List) {
+              if (((List<?>) field).size() == 1) {
+                field = ((List<?>) field).get(0);
+              }
+            }
+            field = constructDateStr(field);
+            if (field instanceof List<?> list) {
+              if (hasDate(list)) {
+                ArrayList<Object> listCopy = new ArrayList<>(list.size());
+                for (Object o : list) listCopy.add(constructDateStr(o));
+                field = listCopy;
+              }
+            }
+            m.put(s, field);
+          });
+      return m;
+    }
+
+    /** Check if a list contains any Date objects */
+    protected boolean hasDate(List<?> list) {
+      boolean hasDate = false;
+      for (Object o : list) {
+        if (o instanceof Date) {
+          hasDate = true;
+          break;
+        }
+      }
+      return hasDate;
+    }
+
+    /** Convert Date objects to ISO formatted strings */
+    protected Object constructDateStr(Object field) {
+      if (field instanceof Date) {
+        field =
+            DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) field).getTime()));
+      }
+      return field;
+    }
+
     abstract void start() throws IOException;

     @SuppressForbidden(reason = "Command line tool prints out to console")
@@ -356,49 +409,12 @@ public class ExportTool extends ToolBase {
     @Override
     public synchronized void accept(SolrDocument doc) throws IOException {
       charArr.reset();
-      Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
-      doc.forEach(
-          (s, field) -> {
-            if (s.equals("_version_") || s.equals("_roor_")) return;
-            if (field instanceof List) {
-              if (((List<?>) field).size() == 1) {
-                field = ((List<?>) field).get(0);
-              }
-            }
-            field = constructDateStr(field);
-            if (field instanceof List<?> list) {
-              if (hasdate(list)) {
-                ArrayList<Object> listCopy = new ArrayList<>(list.size());
-                for (Object o : list) listCopy.add(constructDateStr(o));
-                field = listCopy;
-              }
-            }
-            m.put(s, field);
-          });
+      Map<String, Object> m = processDocument(doc);
       jsonWriter.write(m);
       writer.write(charArr.getArray(), charArr.getStart(), charArr.getEnd());
       writer.append('\n');
       super.accept(doc);
     }
-
-    private boolean hasdate(List<?> list) {
-      boolean hasDate = false;
-      for (Object o : list) {
-        if (o instanceof Date) {
-          hasDate = true;
-          break;
-        }
-      }
-      return hasDate;
-    }
-
-    private Object constructDateStr(Object field) {
-      if (field instanceof Date) {
-        field =
-            DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) field).getTime()));
-      }
-      return field;
-    }
   }

   static class JsonSink extends DocsSink {
@@ -435,25 +451,7 @@ public class ExportTool extends ToolBase {
     @Override
     public synchronized void accept(SolrDocument doc) throws IOException {
       charArr.reset();
-      Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
-      doc.forEach(
-          (s, field) -> {
-            if (s.equals("_version_") || s.equals("_roor_")) return;
-            if (field instanceof List) {
-              if (((List<?>) field).size() == 1) {
-                field = ((List<?>) field).get(0);
-              }
-            }
-            field = constructDateStr(field);
-            if (field instanceof List<?> list) {
-              if (hasdate(list)) {
-                ArrayList<Object> listCopy = new ArrayList<>(list.size());
-                for (Object o : list) listCopy.add(constructDateStr(o));
-                field = listCopy;
-              }
-            }
-            m.put(s, field);
-          });
+      Map<String, Object> m = processDocument(doc);
       if (firstDoc) {
         firstDoc = false;
       } else {
@@ -464,25 +462,6 @@
       writer.append('\n');
       super.accept(doc);
     }
-
-    private boolean hasdate(List<?> list) {
-      boolean hasDate = false;
-      for (Object o : list) {
-        if (o instanceof Date) {
-          hasDate = true;
-          break;
-        }
-      }
-      return hasDate;
-    }
-
-    private Object constructDateStr(Object field) {
-      if (field instanceof Date) {
-        field =
-            DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) field).getTime()));
-      }
-      return field;
-    }
   }

   static class JavabinSink extends DocsSink {
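The shared processDocument(...) above replaces two identical blocks in JsonlSink and JsonSink. Its date handling boils down to rendering java.util.Date values as ISO-8601 instants; a minimal sketch of just that conversion (class name hypothetical, same java.time calls as the patch):

import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.List;
import java.util.stream.Collectors;

/** Sketch of the Date-to-ISO-8601 normalization done in processDocument (simplified). */
public class DateNormalizeSketch {

  static Object normalize(Object field) {
    if (field instanceof Date date) {
      // JSON output wants the ISO instant form that Solr's date fields accept.
      return DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(date.getTime()));
    }
    if (field instanceof List<?> list && list.stream().anyMatch(o -> o instanceof Date)) {
      // Multivalued date fields get normalized element by element.
      return list.stream().map(DateNormalizeSketch::normalize).collect(Collectors.toList());
    }
    return field;
  }

  public static void main(String[] args) {
    System.out.println(normalize(new Date(0L))); // 1970-01-01T00:00:00Z
  }
}

Solr accepts this ISO-8601 instant style for date fields, which is why the exported JSON can be posted straight back to an update endpoint.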
diff --git a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
index f74eedbb0d8..d356c439849 100644
--- a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
+++ b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
@@ -19,6 +19,7 @@ package org.apache.solr.cli;

 import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
@@ -28,6 +29,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.function.Predicate;
+import java.util.zip.GZIPInputStream;
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.client.solrj.SolrClient;
@@ -50,6 +52,31 @@ import org.junit.Test;
 @SolrTestCaseJ4.SuppressSSL
 public class TestExportTool extends SolrCloudTestCase {

+  public void testOutputFormatToFileNameMapping() {
+
+    ToolRuntime runtime = new CLITestHelper.TestingRuntime(false);
+    String url = "http://example:8983/solr/mycollection";
+    ExportTool.Info info = new ExportTool.MultiThreadedRunner(runtime, url, null);
+
+    info.setOutFormat(null, "json", false);
+    assertEquals("mycollection.json", info.out);
+
+    info.setOutFormat(null, "jsonl", false);
+    assertEquals("mycollection.jsonl", info.out);
+
+    info.setOutFormat(null, "javabin", false);
+    assertEquals("mycollection.javabin", info.out);
+
+    String tempFile = createTempDir() + "/myoutput.json";
+    info.setOutFormat(tempFile, "json", false);
+    assertEquals(tempFile, info.out);
+
+    // test with compression
+    tempFile = createTempDir() + "/myoutput.json.gz";
+    info.setOutFormat(tempFile, "json", true);
+    assertEquals(tempFile, info.out);
+  }
+
   @Test
   public void testBasic() throws Exception {
     String COLLECTION_NAME = "globalLoaderColl";
@@ -92,7 +119,8 @@ public class TestExportTool extends SolrCloudTestCase {
       info.fields = "id,desc_s,a_dt";
       info.exportDocs();

-      assertJsonDocsCount(info, 200, record -> "2019-09-30T05:58:03Z".equals(record.get("a_dt")));
+      assertJsonLinesDocsCount(
+          info, 200, record -> "2019-09-30T05:58:03Z".equals(record.get("a_dt")));

       info = new ExportTool.MultiThreadedRunner(runtime, url, null);
       absolutePath =
@@ -102,7 +130,7 @@ public class TestExportTool extends SolrCloudTestCase {
       info.fields = "id,desc_s";
       info.exportDocs();

-      assertJsonDocsCount(info, 1000, null);
+      assertJsonLinesDocsCount(info, 1000, null);

       info = new ExportTool.MultiThreadedRunner(runtime, url, null);
       absolutePath =
@@ -131,7 +159,7 @@ public class TestExportTool extends SolrCloudTestCase {
       info.fields = "id,desc_s";
       info.exportDocs();

-      assertJsonDocsCount2(info, 200);
+      assertJsonDocsCount(info, 200);

       info = new ExportTool.MultiThreadedRunner(runtime, url, null);
       absolutePath =
@@ -141,7 +169,7 @@ public class TestExportTool extends SolrCloudTestCase {
       info.fields = "id,desc_s";
       info.exportDocs();

-      assertJsonDocsCount2(info, 1000);
+      assertJsonDocsCount(info, 1000);

     } finally {
       cluster.shutdown();
@@ -197,11 +225,9 @@ public class TestExportTool extends SolrCloudTestCase {
     assertEquals(docCount, totalDocsFromCores);

     ToolRuntime runtime = new CLITestHelper.TestingRuntime(false);
-    ExportTool.MultiThreadedRunner info;
-    String absolutePath;

-    info = new ExportTool.MultiThreadedRunner(runtime, url, null);
-    absolutePath =
+    ExportTool.MultiThreadedRunner info = new ExportTool.MultiThreadedRunner(runtime, url, null);
+    String absolutePath =
         baseDir.resolve(COLLECTION_NAME + random().nextInt(100000) + ".javabin").toString();
     info.setOutFormat(absolutePath, "javabin", false);
     info.setLimit("-1");
@@ -211,6 +237,7 @@ public class TestExportTool extends SolrCloudTestCase {
       assertEquals(
           e.getValue().longValue(), info.corehandlers.get(e.getKey()).receivedDocs.get());
     }
+
     info = new ExportTool.MultiThreadedRunner(runtime, url, null);
     absolutePath =
         baseDir.resolve(COLLECTION_NAME + random().nextInt(100000) + ".jsonl").toString();
@@ -280,7 +307,7 @@ public class TestExportTool extends SolrCloudTestCase {
     }
   }

-  private void assertJsonDocsCount2(ExportTool.Info info, int expected) {
+  private void assertJsonDocsCount(ExportTool.Info info, int expected) {
     assertTrue(
         "" + info.docsWritten.get() + " expected " + expected, info.docsWritten.get() >= expected);
   }
@@ -310,4 +337,34 @@ public class TestExportTool extends SolrCloudTestCase {
       rdr.close();
     }
   }
+
+  private void assertJsonLinesDocsCount(
+      ExportTool.Info info, int expected, Predicate<Map<String, Object>> predicate)
+      throws IOException {
+    assertTrue(
+        "" + info.docsWritten.get() + " expected " + expected, info.docsWritten.get() >= expected);
+
+    JsonRecordReader jsonReader;
+    Reader rdr;
+    jsonReader = JsonRecordReader.getInst("/", List.of("$FQN:/**"));
+    InputStream is = new FileInputStream(info.out);
+    if (info.compress) {
+      is = new GZIPInputStream(is);
+    }
+    rdr = new InputStreamReader(is, StandardCharsets.UTF_8);
+    try {
+      int[] count = new int[] {0};
+      jsonReader.streamRecords(
+          rdr,
+          (record, path) -> {
+            if (predicate != null) {
+              assertTrue(predicate.test(record));
+            }
+            count[0]++;
+          });
+      assertTrue(count[0] >= expected);
+    } finally {
+      rdr.close();
+    }
+  }
 }
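The new assertJsonLinesDocsCount(...) above layers a GZIPInputStream over the file stream when info.compress is set. A similar standalone sketch for counting records in a possibly gzipped .jsonl export (the file-name-based gzip detection here is an assumption for the sketch, not what the test does):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.GZIPInputStream;

/** Sketch: count exported JSON-with-lines records, transparently handling compressed output. */
public class CountJsonlSketch {

  static long countRecords(Path exportFile) throws IOException {
    InputStream in = Files.newInputStream(exportFile);
    if (exportFile.toString().endsWith(".gz")) {
      in = new GZIPInputStream(in); // same wrapping the test applies when info.compress is set
    }
    try (BufferedReader reader =
        new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
      return reader.lines().filter(line -> !line.isBlank()).count(); // one document per line
    }
  }

  public static void main(String[] args) throws IOException {
    System.out.println(countRecords(Path.of(args[0])));
  }
}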
diff --git a/solr/packaging/test/test_export.bats b/solr/packaging/test/test_export.bats
index cb089ecd5fe..f497fde4839 100644
--- a/solr/packaging/test/test_export.bats
+++ b/solr/packaging/test/test_export.bats
@@ -45,9 +45,8 @@ teardown() {
   assert [ -e techproducts.javabin ]
   rm techproducts.javabin

-  # old pattern of putting a suffix on the output that controlled the format no longer supported ;-).
-  run solr export --solr-url http://localhost:${SOLR_PORT} -c techproducts --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/output.javabin"
-  assert [ -e ${BATS_TEST_TMPDIR}/output.javabin.json ]
+  run solr export --solr-url http://localhost:${SOLR_PORT} -c techproducts --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/output.json"
+  assert [ -e ${BATS_TEST_TMPDIR}/output.json ]

   run solr export --solr-url http://localhost:${SOLR_PORT} -c techproducts --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}"
   assert [ -e ${BATS_TEST_TMPDIR}/techproducts.json ]

diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
index 48fe8e18347..842d294b06e 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
@@ -1619,12 +1619,12 @@ Examples of this command:

 === Exporting Documents to a File

-The `export` command will allow you to export documents from a collection in JSON, JSON with Lines, or Javabin format.
-All documents can be exported, or only those that match a query.
+The `export` command allows you to export documents from a collection in JSON, https://jsonlines.org/[JSON Lines], or Javabin format.
+All documents can be exported, or only those that match a query. You may need to wrap some parameters in quotes.

 NOTE: This hasn't been tested with nested child documents and your results will vary.

-NOTE: The `export` command only works with in a Solr running in cloud mode.
+NOTE: The `export` command only works with Solr running in cloud mode.

 `bin/solr export [options]`

@@ -1661,9 +1661,11 @@ Name of the collection to run an export against.
 |Optional |Default: `json`
 |===
 +
-The file format of the export, `json`, `jsonl`, or `javabin`.
-Choosing `javabin` exports in the native Solr format, and is compact and fast to import.
-`jsonl` is the Json with Lines format, learn more at https://jsonlines.org/.
+The file format of the export: `json` (default), `jsonl`, or `javabin`. This also determines the file extension used.
+`json` and `jsonl` both export documents in the same format as using `wt=json`. The `json` output file is suitable for
+immediately posting back to Solr via the `/update/json` endpoint. `jsonl` outputs each Solr document on its own line,
+and is useful for parallel processing tasks. Learn more at https://jsonlines.org/.
+Choosing `javabin` exports in the native binary Solr format and is compact and faster to import.

 `--output <path>`::
 +
@@ -1673,6 +1675,7 @@ Choosing `javabin` exports in the native Solr format, and is compact and fast to
 |===
 +
 Either the path to the directory for the exported data to be written to, or a specific file to be written out.
+If the file name ends with `.gz` the output will be compressed into a `.gz` file.
 +
 If only a directory is specified then the file will be created with the name of the collection, as in `<collection>.json`.

@@ -1692,7 +1695,7 @@ If you specify `--compress` then the resulting output file will be gzipped
 |Optional |Default: `\*:*`
 |===
 +
-A custom query.
+A custom query to select documents for exporting. The default is `\*:*` which will export all documents.
 `--fields <fields>`::
 +
@@ -1727,7 +1730,7 @@ This parameter is unnecessary if `SOLR_AUTH_TYPE` is defined in `solr.in.sh` or

 *Examples*

-Export all documents from a collection `gettingstarted`:
+Export all documents from a collection `gettingstarted` into a file called `gettingstarted.json`:

 [source,bash]
 bin/solr export --solr-url http://localhost:8983 -c gettingstarted --limit -1

@@ -1741,7 +1744,12 @@ bin/solr export --solr-url http://localhost:8983 -c gettingstarted --limit -1 --

 === Importing Documents into a Collection

-Once you have exported documents in a file, you can use the xref:indexing-guide:indexing-with-update-handlers.adoc[/update request handler] to import them to a new Solr collection.
+Once you have exported documents to a file, you can use the xref:indexing-guide:indexing-with-update-handlers.adoc#json-formatted-index-updates[/update request handler] to import them to a new Solr collection.
+Notice the different endpoints used depending on the format.
+
+*Example: import `json` files*
+
+`curl -X POST --header "Content-Type: application/json" -d @gettingstarted.json http://localhost:8983/solr/gettingstarted/update/json?commit=true`

 *Example: import `json` files*

 Now import the data with either of these methods:

 [,console]
 ----
-$ curl -X POST -d @gettingstarted.json 'http://localhost:8983/solr/test_collection/update/json/docs?commit=true'
+$ curl -X POST --header "Content-Type: application/json" -d @gettingstarted.json 'http://localhost:8983/solr/test_collection/update/json/docs?commit=true'
 ----
 or
 [,console]