alamb commented on code in PR #20823:
URL: https://github.com/apache/datafusion/pull/20823#discussion_r3046452282
##########
datafusion/core/tests/datasource/object_store_access.rs:
##########
@@ -397,6 +400,348 @@ async fn query_partitioned_csv_file() {
);
}
+// =====================================================================
+// JSON (NDJSON) tests — mirrors the CSV tests above
+// =====================================================================
+
+#[tokio::test]
+async fn create_single_json_file() {
+ let test = Test::new().with_single_file_json().await;
+ assert_snapshot!(
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 2
+ - GET (opts) path=json_table.json head=true
+ - GET (opts) path=json_table.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn query_single_json_file() {
+ let test = Test::new().with_single_file_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.00001 | 5e-12 | true |
+ | 0.00002 | 4e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 2
+ - GET (opts) path=json_table.json head=true
+ - GET (opts) path=json_table.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn create_multi_file_json() {
+ let test = Test::new().with_multi_file_json().await;
+ assert_snapshot!(
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 4
+ - LIST prefix=data
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn multi_query_multi_file_json() {
+ let test = Test::new().with_multi_file_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+
+ // Force a cache eviction by removing the data limit for the cache
+ assert_snapshot!(
+ test.query("set
datafusion.runtime.list_files_cache_limit=\"0K\"").await,
+ @r"
+ ------- Query Output (0 rows) -------
+ ++
+ ++
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 0
+ "
+ );
+
+ // Then re-enable the cache
+ assert_snapshot!(
+ test.query("set
datafusion.runtime.list_files_cache_limit=\"1M\"").await,
+ @r"
+ ------- Query Output (0 rows) -------
+ ++
+ ++
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 0
+ "
+ );
+
+ // this query should list the table since the cache entries were evicted
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 4
+ - LIST prefix=data
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+
+ // this query should not list the table since the entries were added in
the previous query
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn query_multi_json_file() {
+ let test = Test::new().with_multi_file_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn query_partitioned_json_file() {
+ let test = Test::new().with_partitioned_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00001 | 1e-12 | true | 1 | 10 | 100 |
+ | 0.00003 | 5e-12 | false | 1 | 10 | 100 |
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ | 0.00003 | 3e-12 | true | 3 | 30 | 300 |
+ | 0.00003 | 5e-12 | false | 3 | 30 | 300 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/a=1/b=10/c=100/file_1.json
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ - GET (opts) path=data/a=3/b=30/c=300/file_3.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE a=2").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE b=20").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE c=200").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE a=2 AND
b=20").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE a<2 AND b=10
AND c=100").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00001 | 1e-12 | true | 1 | 10 | 100 |
+ | 0.00003 | 5e-12 | false | 1 | 10 | 100 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=1/b=10/c=100/file_1.json
+ "
+ );
+}
+
+/// Test that a JSON file split into byte ranges via repartitioning produces
+/// exactly one GET request per byte range — no extra requests for boundary
seeking.
+///
+/// With a single file and `target_partitions=3`, the repartitioner produces
+/// exactly 3 ranges. Each range is served by a single
[`AlignedBoundaryStream`]
+/// which issues exactly one bounded `get_opts` call, so there are 3 data GETs
+/// plus 1 HEAD (to determine file size) = **4 total**.
+///
+/// This differs from the CSV reader, which needs multiple GETs per range.
Review Comment:
This would be a nice follow-on PR (add CSV tests to document the current
behavior). Maybe someone wants to do that.
##########
datafusion/core/tests/datasource/object_store_access.rs:
##########
@@ -397,6 +400,348 @@ async fn query_partitioned_csv_file() {
);
}
+// =====================================================================
+// JSON (NDJSON) tests — mirrors the CSV tests above
+// =====================================================================
+
+#[tokio::test]
+async fn create_single_json_file() {
+ let test = Test::new().with_single_file_json().await;
+ assert_snapshot!(
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 2
+ - GET (opts) path=json_table.json head=true
+ - GET (opts) path=json_table.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn query_single_json_file() {
+ let test = Test::new().with_single_file_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.00001 | 5e-12 | true |
+ | 0.00002 | 4e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 2
+ - GET (opts) path=json_table.json head=true
+ - GET (opts) path=json_table.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn create_multi_file_json() {
+ let test = Test::new().with_multi_file_json().await;
+ assert_snapshot!(
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 4
+ - LIST prefix=data
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn multi_query_multi_file_json() {
+ let test = Test::new().with_multi_file_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+
+ // Force a cache eviction by removing the data limit for the cache
+ assert_snapshot!(
+ test.query("set
datafusion.runtime.list_files_cache_limit=\"0K\"").await,
+ @r"
+ ------- Query Output (0 rows) -------
+ ++
+ ++
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 0
+ "
+ );
+
+ // Then re-enable the cache
+ assert_snapshot!(
+ test.query("set
datafusion.runtime.list_files_cache_limit=\"1M\"").await,
+ @r"
+ ------- Query Output (0 rows) -------
+ ++
+ ++
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 0
+ "
+ );
+
+ // this query should list the table since the cache entries were evicted
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 4
+ - LIST prefix=data
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+
+ // this query should not list the table since the entries were added in
the previous query
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn query_multi_json_file() {
+ let test = Test::new().with_multi_file_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn query_partitioned_json_file() {
+ let test = Test::new().with_partitioned_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00001 | 1e-12 | true | 1 | 10 | 100 |
+ | 0.00003 | 5e-12 | false | 1 | 10 | 100 |
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ | 0.00003 | 3e-12 | true | 3 | 30 | 300 |
+ | 0.00003 | 5e-12 | false | 3 | 30 | 300 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/a=1/b=10/c=100/file_1.json
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ - GET (opts) path=data/a=3/b=30/c=300/file_3.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE a=2").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE b=20").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE c=200").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE a=2 AND
b=20").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE a<2 AND b=10
AND c=100").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00001 | 1e-12 | true | 1 | 10 | 100 |
+ | 0.00003 | 5e-12 | false | 1 | 10 | 100 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=1/b=10/c=100/file_1.json
+ "
+ );
+}
+
+/// Test that a JSON file split into byte ranges via repartitioning produces
+/// exactly one GET request per byte range — no extra requests for boundary
seeking.
Review Comment:
👍
##########
datafusion/core/tests/datasource/object_store_access.rs:
##########
@@ -397,6 +400,348 @@ async fn query_partitioned_csv_file() {
);
}
+// =====================================================================
+// JSON (NDJSON) tests — mirrors the CSV tests above
+// =====================================================================
+
+#[tokio::test]
+async fn create_single_json_file() {
+ let test = Test::new().with_single_file_json().await;
+ assert_snapshot!(
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 2
+ - GET (opts) path=json_table.json head=true
+ - GET (opts) path=json_table.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn query_single_json_file() {
+ let test = Test::new().with_single_file_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.00001 | 5e-12 | true |
+ | 0.00002 | 4e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 2
+ - GET (opts) path=json_table.json head=true
+ - GET (opts) path=json_table.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn create_multi_file_json() {
+ let test = Test::new().with_multi_file_json().await;
+ assert_snapshot!(
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 4
+ - LIST prefix=data
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn multi_query_multi_file_json() {
+ let test = Test::new().with_multi_file_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+
+ // Force a cache eviction by removing the data limit for the cache
+ assert_snapshot!(
+ test.query("set
datafusion.runtime.list_files_cache_limit=\"0K\"").await,
+ @r"
+ ------- Query Output (0 rows) -------
+ ++
+ ++
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 0
+ "
+ );
+
+ // Then re-enable the cache
+ assert_snapshot!(
+ test.query("set
datafusion.runtime.list_files_cache_limit=\"1M\"").await,
+ @r"
+ ------- Query Output (0 rows) -------
+ ++
+ ++
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 0
+ "
+ );
+
+ // this query should list the table since the cache entries were evicted
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 4
+ - LIST prefix=data
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+
+ // this query should not list the table since the entries were added in
the previous query
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn query_multi_json_file() {
+ let test = Test::new().with_multi_file_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/file_0.json
+ - GET (opts) path=data/file_1.json
+ - GET (opts) path=data/file_2.json
+ "
+ );
+}
+
+#[tokio::test]
+async fn query_partitioned_json_file() {
+ let test = Test::new().with_partitioned_json().await;
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00001 | 1e-12 | true | 1 | 10 | 100 |
+ | 0.00003 | 5e-12 | false | 1 | 10 | 100 |
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ | 0.00003 | 3e-12 | true | 3 | 30 | 300 |
+ | 0.00003 | 5e-12 | false | 3 | 30 | 300 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/a=1/b=10/c=100/file_1.json
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ - GET (opts) path=data/a=3/b=30/c=300/file_3.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE a=2").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE b=20").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE c=200").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE a=2 AND
b=20").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00002 | 2e-12 | true | 2 | 20 | 200 |
+ | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=2/b=20/c=200/file_2.json
+ "
+ );
+
+ assert_snapshot!(
+ test.query("select * from json_table_partitioned WHERE a<2 AND b=10
AND c=100").await,
+ @r"
+ ------- Query Output (2 rows) -------
+ +---------+-------+-------+---+----+-----+
+ | d1 | d2 | d3 | a | b | c |
+ +---------+-------+-------+---+----+-----+
+ | 0.00001 | 1e-12 | true | 1 | 10 | 100 |
+ | 0.00003 | 5e-12 | false | 1 | 10 | 100 |
+ +---------+-------+-------+---+----+-----+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 1
+ - GET (opts) path=data/a=1/b=10/c=100/file_1.json
+ "
+ );
+}
+
+/// Test that a JSON file split into byte ranges via repartitioning produces
+/// exactly one GET request per byte range — no extra requests for boundary
seeking.
+///
+/// With a single file and `target_partitions=3`, the repartitioner produces
+/// exactly 3 ranges. Each range is served by a single
[`AlignedBoundaryStream`]
+/// which issues exactly one bounded `get_opts` call, so there are 3 data GETs
+/// plus 1 HEAD (to determine file size) = **4 total**.
+///
+/// This differs from the CSV reader, which needs multiple GETs per range.
+///
+/// This test documents the current request pattern to catch regressions.
+#[tokio::test]
+async fn query_json_file_with_byte_range_partitions() {
Review Comment:
I ran this test locally without the code changes in this PR and it fails
like this (as expected):
```
• === query_json_file_with_byte_range_partitions ===
Finished `test` profile [unoptimized + debuginfo] target(s) in 0.15s
Running tests/core_integration.rs
(target/debug/deps/core_integration-13d29de1dea6b31c)
running 1 test
test
datasource::object_store_access::query_json_file_with_byte_range_partitions ...
FAILED
failures:
----
datasource::object_store_access::query_json_file_with_byte_range_partitions
stdout ----
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Snapshot Summary
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Snapshot: query_json_file_with_byte_range_partitions
Source: datafusion/core/tests/datasource/object_store_access.rs:720
────────────────────────────────────────────────────────────────────────────────
Expression: test.query("select * from json_range_table").await
────────────────────────────────────────────────────────────────────────────────
-old snapshot
+new results
────────────┬───────────────────────────────────────────────────────────────────
10 10 │ | 0.00006 | 6e-12 | true |
11 11 │ +---------+-------+------+
12 12 │ ------- Object Store Request Summary -------
13 13 │ RequestCountingObjectStore()
14 │-Total Requests: 4
14 │+Total Requests: 8
15 15 │ - GET (opts) path=json_range_table.json head=true
16 │-- GET (opts) path=json_range_table.json range=0-216
17 16 │ - GET (opts) path=json_range_table.json range=71-216
18 │-- GET (opts) path=json_range_table.json range=143-216
17 │+- GET (opts) path=json_range_table.json range=0-72
18 │+- GET (opts) path=json_range_table.json range=71-216
19 │+- GET (opts) path=json_range_table.json range=143-216
20 │+- GET (opts) path=json_range_table.json range=72-144
21 │+- GET (opts) path=json_range_table.json range=143-216
22 │+- GET (opts) path=json_range_table.json range=144-216
────────────┴───────────────────────────────────────────────────────────────────
To update snapshots run `cargo insta review`
Stopped on the first failure. Run `cargo insta test` to run all snapshots.
thread
'datasource::object_store_access::query_json_file_with_byte_range_partitions'
(24436221) panicked at /Users/andrewlamb/.cargo/registry/src/index.crates.io-
1949cf8c6b5b557f/insta-1.47.2/src/runtime.rs:719:13:
snapshot assertion for 'query_json_file_with_byte_range_partitions' failed
in line 720
note: run with `RUST_BACKTRACE=1` environment variable to display a
backtrace
failures:
datasource::object_store_access::query_json_file_with_byte_range_partitions
test result: FAILED. 0 passed; 1 failed; 0 ignored; 0 measured; 934
filtered out; finished in 0.13s
error: test failed, to rerun pass `-p datafusion --test core_integration`
```
Codex summarized this nicely:
- PR expectation: 4 requests total
- head=true
- range=0-216
- range=71-216
- range=143-216
- main actual: 8 requests total
- head=true
- range=71-216
- range=0-72
- range=71-216
- range=143-216
- range=72-144
- range=143-216
- range=144-216
##########
datafusion/core/tests/datasource/object_store_access.rs:
##########
@@ -397,6 +400,348 @@ async fn query_partitioned_csv_file() {
);
}
+// =====================================================================
+// JSON (NDJSON) tests — mirrors the CSV tests above
+// =====================================================================
+
+#[tokio::test]
+async fn create_single_json_file() {
+ let test = Test::new().with_single_file_json().await;
+ assert_snapshot!(
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 2
+ - GET (opts) path=json_table.json head=true
+ - GET (opts) path=json_table.json
+ "
+ );
+}
+
+#[tokio::test]
Review Comment:
These tests are probably not strictly necessary, as they cover the same
behavior that is tested in the parquet opener. However, I think having the
additional coverage is a good idea in case we ever specialize the opening
process more.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]