geoffreyclaude commented on code in PR #18832:
URL: https://github.com/apache/datafusion/pull/18832#discussion_r2603365909
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -198,65 +248,414 @@ impl ArrayStaticFilter {
}
}
-struct Int32StaticFilter {
- null_count: usize,
- values: HashSet<i32>,
+/// Threshold for switching from sorted Vec (binary search) to HashSet
+/// For small lists, binary search has better cache locality and lower overhead
+/// Maximum list size for using sorted lookup (binary search).
+/// Lists with more elements use hash lookup instead.
+const SORTED_LOOKUP_MAX_LEN: usize = 8;
+
+/// Helper to build a BooleanArray result for IN list operations.
+/// Handles SQL three-valued logic for NULL values.
+///
+/// # Arguments
+/// * `len` - Number of elements in the needle array
+/// * `needle_nulls` - Optional validity buffer from the needle array
+/// * `haystack_has_nulls` - Whether the IN list contains NULL values
+/// * `negated` - Whether this is a NOT IN operation
+/// * `contains` - Closure that returns whether needle[i] is found in the
haystack
+#[inline]
+fn build_in_list_result<C>(
+ len: usize,
+ needle_nulls: Option<&NullBuffer>,
+ haystack_has_nulls: bool,
+ negated: bool,
+ contains: C,
+) -> BooleanArray
+where
+ C: Fn(usize) -> bool,
+{
+ // Use collect_bool for all paths - it's vectorized and faster than
element-by-element append.
+ // Match on (needle_has_nulls, haystack_has_nulls, negated) to specialize
each case.
+ match (needle_nulls, haystack_has_nulls, negated) {
+ // Haystack has nulls: result is NULL when not found (might match the
NULL)
+ // values_buf == nulls_buf, so compute once and clone
+ (Some(validity), true, false) => {
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let buf = validity.inner() & &contains_buf;
+ BooleanArray::new(buf.clone(), Some(NullBuffer::new(buf)))
+ }
+ (None, true, false) => {
+ let buf = BooleanBuffer::collect_bool(len, |i| contains(i));
+ BooleanArray::new(buf.clone(), Some(NullBuffer::new(buf)))
+ }
+ (Some(validity), true, true) => {
+ // Compute nulls_buf via SIMD AND, then derive values_buf via XOR.
+ // Uses identity: A & !B = A ^ (A & B) to get values from nulls.
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let nulls_buf = validity.inner() & &contains_buf;
+ let values_buf = validity.inner() ^ &nulls_buf; // valid &
!contains
+ BooleanArray::new(values_buf, Some(NullBuffer::new(nulls_buf)))
+ }
+ (None, true, true) => {
+ // No needle nulls, but haystack has nulls
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let values_buf = !&contains_buf;
+ BooleanArray::new(values_buf, Some(NullBuffer::new(contains_buf)))
+ }
+ // Only needle has nulls: nulls_buf is just validity (reuse it
directly!)
+ (Some(validity), false, false) => {
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let values_buf = validity.inner() & &contains_buf;
+ BooleanArray::new(values_buf, Some(validity.clone()))
+ }
+ (Some(validity), false, true) => {
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let values_buf = validity.inner() & &(!&contains_buf);
+ BooleanArray::new(values_buf, Some(validity.clone()))
+ }
+ // No nulls anywhere: no validity buffer needed
+ (None, false, false) => {
+ let buf = BooleanBuffer::collect_bool(len, |i| contains(i));
+ BooleanArray::new(buf, None)
+ }
+ (None, false, true) => {
+ let buf = BooleanBuffer::collect_bool(len, |i| !contains(i));
+ BooleanArray::new(buf, None)
+ }
+ }
}
-impl Int32StaticFilter {
- fn try_new(in_array: &ArrayRef) -> Result<Self> {
- let in_array = in_array
- .as_primitive_opt::<Int32Type>()
- .ok_or_else(|| exec_datafusion_err!("Failed to downcast array"))?;
+/// Sorted lookup using binary search. Best for small lists (< 8 elements).
+struct SortedLookup<T>(Vec<T>);
- let mut values = HashSet::with_capacity(in_array.len());
- let null_count = in_array.null_count();
+impl<T: Ord> SortedLookup<T> {
+ fn new(mut values: Vec<T>) -> Self {
+ values.sort_unstable();
+ values.dedup();
+ Self(values)
+ }
- for v in in_array.iter().flatten() {
- values.insert(v);
- }
+ #[inline]
+ fn contains(&self, value: &T) -> bool {
+ self.0.binary_search(value).is_ok()
Review Comment:
@Dandandan and @adriangb: I opened a PR to `pydantic` at
https://github.com/pydantic/datafusion/pull/48 as a POC with the bench numbers
on my mac.
It shows another ~70% improvement on short in-lists of primitive types, and
~40% on short utf8view in-lists as well.
This is a pretty major refactor, so I'm very much in favor of "freezing" the
features of this PR and leaving my further improvement to later!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]