This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 4166a6d60f perf: Optimize comparison on nested types (#20716)
4166a6d60f is described below
commit 4166a6d60f531b47826c2ed554e08a6f5afd7ca1
Author: Neil Conway <[email protected]>
AuthorDate: Mon Mar 16 13:43:48 2026 -0700
perf: Optimize comparison on nested types (#20716)
## Which issue does this PR close?
N/A
## Rationale for this change
`BooleanBuffer::collect_bool()` is faster than `map().collect()`. Per
discussion on #20694; originally suggested by @Dandandan.
## What changes are included in this PR?
- Implement optimization
- Add benchmark for nested struct comparison
## Are these changes tested?
Yes, covered by existing tests.
## Are there any user-facing changes?
No.
## AI usage
Multiple AI tools were used to iterate on this PR. I have reviewed and
understand the resulting code.
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
Cargo.lock | 2 +
datafusion/physical-expr-common/Cargo.toml | 8 +++
.../physical-expr-common/benches/compare_nested.rs | 74 ++++++++++++++++++++++
datafusion/physical-expr-common/src/datum.rs | 8 +--
4 files changed, 88 insertions(+), 4 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 35660359ce..632c82be5a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2425,12 +2425,14 @@ dependencies = [
"ahash",
"arrow",
"chrono",
+ "criterion",
"datafusion-common",
"datafusion-expr-common",
"hashbrown 0.16.1",
"indexmap 2.13.0",
"itertools 0.14.0",
"parking_lot",
+ "rand 0.9.2",
]
[[package]]
diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml
index a81eafe196..453c8a0cb4 100644
--- a/datafusion/physical-expr-common/Cargo.toml
+++ b/datafusion/physical-expr-common/Cargo.toml
@@ -50,3 +50,11 @@ hashbrown = { workspace = true }
indexmap = { workspace = true }
itertools = { workspace = true }
parking_lot = { workspace = true }
+
+[dev-dependencies]
+criterion = { workspace = true }
+rand = { workspace = true }
+
+[[bench]]
+harness = false
+name = "compare_nested"
diff --git a/datafusion/physical-expr-common/benches/compare_nested.rs b/datafusion/physical-expr-common/benches/compare_nested.rs
new file mode 100644
index 0000000000..56c122fef9
--- /dev/null
+++ b/datafusion/physical-expr-common/benches/compare_nested.rs
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, Int32Array, Scalar, StringArray, StructArray};
+use arrow::datatypes::{DataType, Field, Fields};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_expr_common::operator::Operator;
+use datafusion_physical_expr_common::datum::compare_op_for_nested;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+/// Build a StructArray with fields {x: Int32, y: Utf8}.
+fn make_struct_array(num_rows: usize, rng: &mut StdRng) -> ArrayRef {
+ let ints: Int32Array = (0..num_rows).map(|_| Some(rng.random::<i32>())).collect();
+
+ let strings: StringArray = (0..num_rows)
+ .map(|_| {
+ let s: String = (0..12)
+ .map(|_| rng.random_range(b'a'..=b'z') as char)
+ .collect();
+ Some(s)
+ })
+ .collect();
+
+ let fields = Fields::from(vec![
+ Field::new("x", DataType::Int32, false),
+ Field::new("y", DataType::Utf8, false),
+ ]);
+
+ Arc::new(
+ StructArray::try_new(fields, vec![Arc::new(ints), Arc::new(strings)], None)
+ .unwrap(),
+ )
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+ let num_rows = 8192;
+ let mut rng = StdRng::seed_from_u64(42);
+
+ let lhs = make_struct_array(num_rows, &mut rng);
+ let rhs_array = make_struct_array(num_rows, &mut rng);
+ let rhs_scalar = Scalar::new(make_struct_array(1, &mut rng));
+
+ c.bench_function("compare_nested array_array", |b| {
+ b.iter(|| {
+ black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_array).unwrap())
+ })
+ });
+
+ c.bench_function("compare_nested array_scalar", |b| {
+ b.iter(|| {
+ black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_scalar).unwrap())
+ })
+ });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/physical-expr-common/src/datum.rs b/datafusion/physical-expr-common/src/datum.rs
index 9efaca0f6b..bd5790507f 100644
--- a/datafusion/physical-expr-common/src/datum.rs
+++ b/datafusion/physical-expr-common/src/datum.rs
@@ -17,7 +17,7 @@
use arrow::array::BooleanArray;
use arrow::array::{ArrayRef, Datum, make_comparator};
-use arrow::buffer::NullBuffer;
+use arrow::buffer::{BooleanBuffer, NullBuffer};
use arrow::compute::kernels::cmp::{
distinct, eq, gt, gt_eq, lt, lt_eq, neq, not_distinct,
};
@@ -171,9 +171,9 @@ pub fn compare_op_for_nested(
};
let values = match (is_l_scalar, is_r_scalar) {
- (false, false) => (0..len).map(|i| cmp_with_op(i, i)).collect(),
- (true, false) => (0..len).map(|i| cmp_with_op(0, i)).collect(),
- (false, true) => (0..len).map(|i| cmp_with_op(i, 0)).collect(),
+ (false, false) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(i, i)),
+ (true, false) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(0, i)),
+ (false, true) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(i, 0)),
(true, true) => std::iter::once(cmp_with_op(0, 0)).collect(),
};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]