(datafusion) branch main updated: Add flat vs. struct field projection benchmarks (#21257)

github-bot Mon, 30 Mar 2026 13:27:57 -0700

This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new ccaf8022da Add flat vs. struct field projection benchmarks (#21257)
ccaf8022da is described below

commit ccaf8022da368052d8a1d910c28ccc8af8422e03
Author: Matthew Kim <[email protected]>
AuthorDate: Mon Mar 30 16:27:08 2026 -0400

    Add flat vs. struct field projection benchmarks (#21257)
    
    ## Rationale for this change
    
    This PR adds a benchmark comparing top-level column access against
    struct field access for the same logical data
    
    #20925 introduced leaf-level projection masking so that projecting a
    single struct field skips decoding its siblings. #21180 added benchmarks
    measuring that improvement across different struct shapes. But neither
    benchmark answers how struct field access compares to reading the same
    column at the top level. Without that baseline, it's hard to know how
    much overhead the struct access path itself adds.
---
 .../core/benches/parquet_struct_projection.rs      | 85 +++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/benches/parquet_struct_projection.rs b/datafusion/core/benches/parquet_struct_projection.rs
index 65b3905da8..7d5b220d39 100644
--- a/datafusion/core/benches/parquet_struct_projection.rs
+++ b/datafusion/core/benches/parquet_struct_projection.rs
@@ -404,10 +404,93 @@ fn nested_benchmarks(c: &mut Criterion) {
     drop(temp_file);
 }
 
+fn flat_schema() -> SchemaRef {
+    Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("large_string", DataType::Utf8, false),
+        Field::new("small_int", DataType::Int32, false),
+    ]))
+}
+
+fn flat_batch(batch_id: usize) -> RecordBatch {
+    let schema = flat_schema();
+    let len = WRITE_RECORD_BATCH_SIZE;
+
+    let base_id = (batch_id * len) as i32;
+    let id_values: Vec<i32> = (0..len).map(|i| base_id + i as i32).collect();
+    let id_array = Arc::new(Int32Array::from(id_values.clone()));
+    let small_int_array = Arc::new(Int32Array::from(id_values));
+
+    let large_string: String = "x".repeat(LARGE_STRING_LEN);
+    let mut string_builder = StringBuilder::new();
+    for _ in 0..len {
+        string_builder.append_value(&large_string);
+    }
+    let large_string_array = Arc::new(string_builder.finish());
+
+    RecordBatch::try_new(
+        schema,
+        vec![id_array, large_string_array as ArrayRef, small_int_array],
+    )
+    .unwrap()
+}
+
+/// Compare selecting a small field from a flat (top-level) schema vs from
+/// inside a struct. Both files contain the same logical data — the only
+/// difference is whether `small_int` lives at the top level or nested inside
+/// a struct column.
+fn flat_vs_struct_benchmarks(c: &mut Criterion) {
+    let flat_file = generate_file(flat_schema(), flat_batch, "flat");
+    let flat_path = flat_file.path().display().to_string();
+    assert!(Path::new(&flat_path).exists(), "path not found");
+
+    let struct_file = generate_file(narrow_schema(), narrow_batch, 
"narrow_struct_cmp");
+    let struct_path = struct_file.path().display().to_string();
+    assert!(Path::new(&struct_path).exists(), "path not found");
+
+    let rt = Runtime::new().unwrap();
+    let flat_ctx = create_context(&rt, &flat_path, "t");
+    let struct_ctx = create_context(&rt, &struct_path, "t");
+
+    let mut group = c.benchmark_group("flat_vs_struct");
+    group.sample_size(10);
+    group.warm_up_time(Duration::from_secs(1));
+    group.measurement_time(Duration::from_secs(2));
+
+    // small int: top-level vs struct field
+    group.bench_function("flat_select_small_int", |b| {
+        b.iter(|| query(&flat_ctx, &rt, "SELECT small_int FROM t"))
+    });
+    group.bench_function("struct_select_small_int", |b| {
+        b.iter(|| query(&struct_ctx, &rt, "SELECT s['small_int'] FROM t"))
+    });
+
+    // large string: top-level vs struct field
+    group.bench_function("flat_select_large_string", |b| {
+        b.iter(|| query(&flat_ctx, &rt, "SELECT large_string FROM t"))
+    });
+    group.bench_function("struct_select_large_string", |b| {
+        b.iter(|| query(&struct_ctx, &rt, "SELECT s['large_string'] FROM t"))
+    });
+
+    // aggregation: SUM of small int
+    group.bench_function("flat_sum_small_int", |b| {
+        b.iter(|| query(&flat_ctx, &rt, "SELECT SUM(small_int) FROM t"))
+    });
+    group.bench_function("struct_sum_small_int", |b| {
+        b.iter(|| query(&struct_ctx, &rt, "SELECT SUM(s['small_int']) FROM t"))
+    });
+
+    group.finish();
+    drop(flat_file);
+    drop(struct_file);
+}
+
 criterion_group!(
     benches,
     narrow_benchmarks,
     wide_benchmarks,
-    nested_benchmarks
+    nested_benchmarks,
+    flat_vs_struct_benchmarks,
 );
 criterion_main!(benches);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datafusion) branch main updated: Add flat vs. struct field projection benchmarks (#21257)

Reply via email to