geoffreyclaude commented on code in PR #18832:
URL: https://github.com/apache/datafusion/pull/18832#discussion_r2603681481
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -198,65 +248,414 @@ impl ArrayStaticFilter {
}
}
-struct Int32StaticFilter {
- null_count: usize,
- values: HashSet<i32>,
+/// Threshold for switching from sorted Vec (binary search) to HashSet
+/// For small lists, binary search has better cache locality and lower overhead
+/// Maximum list size for using sorted lookup (binary search).
+/// Lists with more elements use hash lookup instead.
+const SORTED_LOOKUP_MAX_LEN: usize = 8;
+
+/// Helper to build a BooleanArray result for IN list operations.
+/// Handles SQL three-valued logic for NULL values.
+///
+/// # Arguments
+/// * `len` - Number of elements in the needle array
+/// * `needle_nulls` - Optional validity buffer from the needle array
+/// * `haystack_has_nulls` - Whether the IN list contains NULL values
+/// * `negated` - Whether this is a NOT IN operation
+/// * `contains` - Closure that returns whether needle[i] is found in the
haystack
+#[inline]
+fn build_in_list_result<C>(
+ len: usize,
+ needle_nulls: Option<&NullBuffer>,
+ haystack_has_nulls: bool,
+ negated: bool,
+ contains: C,
+) -> BooleanArray
+where
+ C: Fn(usize) -> bool,
+{
+ // Use collect_bool for all paths - it's vectorized and faster than
element-by-element append.
+ // Match on (needle_has_nulls, haystack_has_nulls, negated) to specialize
each case.
+ match (needle_nulls, haystack_has_nulls, negated) {
+ // Haystack has nulls: result is NULL when not found (might match the
NULL)
+ // values_buf == nulls_buf, so compute once and clone
+ (Some(validity), true, false) => {
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let buf = validity.inner() & &contains_buf;
+ BooleanArray::new(buf.clone(), Some(NullBuffer::new(buf)))
+ }
+ (None, true, false) => {
+ let buf = BooleanBuffer::collect_bool(len, |i| contains(i));
+ BooleanArray::new(buf.clone(), Some(NullBuffer::new(buf)))
+ }
+ (Some(validity), true, true) => {
+ // Compute nulls_buf via SIMD AND, then derive values_buf via XOR.
+ // Uses identity: A & !B = A ^ (A & B) to get values from nulls.
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let nulls_buf = validity.inner() & &contains_buf;
+ let values_buf = validity.inner() ^ &nulls_buf; // valid &
!contains
+ BooleanArray::new(values_buf, Some(NullBuffer::new(nulls_buf)))
+ }
+ (None, true, true) => {
+ // No needle nulls, but haystack has nulls
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let values_buf = !&contains_buf;
+ BooleanArray::new(values_buf, Some(NullBuffer::new(contains_buf)))
+ }
+ // Only needle has nulls: nulls_buf is just validity (reuse it
directly!)
+ (Some(validity), false, false) => {
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let values_buf = validity.inner() & &contains_buf;
+ BooleanArray::new(values_buf, Some(validity.clone()))
+ }
+ (Some(validity), false, true) => {
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let values_buf = validity.inner() & &(!&contains_buf);
+ BooleanArray::new(values_buf, Some(validity.clone()))
+ }
+ // No nulls anywhere: no validity buffer needed
+ (None, false, false) => {
+ let buf = BooleanBuffer::collect_bool(len, |i| contains(i));
+ BooleanArray::new(buf, None)
+ }
+ (None, false, true) => {
+ let buf = BooleanBuffer::collect_bool(len, |i| !contains(i));
+ BooleanArray::new(buf, None)
+ }
+ }
}
-impl Int32StaticFilter {
- fn try_new(in_array: &ArrayRef) -> Result<Self> {
- let in_array = in_array
- .as_primitive_opt::<Int32Type>()
- .ok_or_else(|| exec_datafusion_err!("Failed to downcast array"))?;
+/// Sorted lookup using binary search. Best for small lists (< 8 elements).
+struct SortedLookup<T>(Vec<T>);
- let mut values = HashSet::with_capacity(in_array.len());
- let null_count = in_array.null_count();
+impl<T: Ord> SortedLookup<T> {
+ fn new(mut values: Vec<T>) -> Self {
+ values.sort_unstable();
+ values.dedup();
+ Self(values)
+ }
- for v in in_array.iter().flatten() {
- values.insert(v);
- }
+ #[inline]
+ fn contains(&self, value: &T) -> bool {
+ self.0.binary_search(value).is_ok()
Review Comment:
Yes, you should probably just revert my commit
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -198,65 +248,414 @@ impl ArrayStaticFilter {
}
}
-struct Int32StaticFilter {
- null_count: usize,
- values: HashSet<i32>,
+/// Threshold for switching from sorted Vec (binary search) to HashSet
+/// For small lists, binary search has better cache locality and lower overhead
+/// Maximum list size for using sorted lookup (binary search).
+/// Lists with more elements use hash lookup instead.
+const SORTED_LOOKUP_MAX_LEN: usize = 8;
+
+/// Helper to build a BooleanArray result for IN list operations.
+/// Handles SQL three-valued logic for NULL values.
+///
+/// # Arguments
+/// * `len` - Number of elements in the needle array
+/// * `needle_nulls` - Optional validity buffer from the needle array
+/// * `haystack_has_nulls` - Whether the IN list contains NULL values
+/// * `negated` - Whether this is a NOT IN operation
+/// * `contains` - Closure that returns whether needle[i] is found in the
haystack
+#[inline]
+fn build_in_list_result<C>(
+ len: usize,
+ needle_nulls: Option<&NullBuffer>,
+ haystack_has_nulls: bool,
+ negated: bool,
+ contains: C,
+) -> BooleanArray
+where
+ C: Fn(usize) -> bool,
+{
+ // Use collect_bool for all paths - it's vectorized and faster than
element-by-element append.
+ // Match on (needle_has_nulls, haystack_has_nulls, negated) to specialize
each case.
+ match (needle_nulls, haystack_has_nulls, negated) {
+ // Haystack has nulls: result is NULL when not found (might match the
NULL)
+ // values_buf == nulls_buf, so compute once and clone
+ (Some(validity), true, false) => {
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let buf = validity.inner() & &contains_buf;
+ BooleanArray::new(buf.clone(), Some(NullBuffer::new(buf)))
+ }
+ (None, true, false) => {
+ let buf = BooleanBuffer::collect_bool(len, |i| contains(i));
+ BooleanArray::new(buf.clone(), Some(NullBuffer::new(buf)))
+ }
+ (Some(validity), true, true) => {
+ // Compute nulls_buf via SIMD AND, then derive values_buf via XOR.
+ // Uses identity: A & !B = A ^ (A & B) to get values from nulls.
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let nulls_buf = validity.inner() & &contains_buf;
+ let values_buf = validity.inner() ^ &nulls_buf; // valid &
!contains
+ BooleanArray::new(values_buf, Some(NullBuffer::new(nulls_buf)))
+ }
+ (None, true, true) => {
+ // No needle nulls, but haystack has nulls
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let values_buf = !&contains_buf;
+ BooleanArray::new(values_buf, Some(NullBuffer::new(contains_buf)))
+ }
+ // Only needle has nulls: nulls_buf is just validity (reuse it
directly!)
+ (Some(validity), false, false) => {
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let values_buf = validity.inner() & &contains_buf;
+ BooleanArray::new(values_buf, Some(validity.clone()))
+ }
+ (Some(validity), false, true) => {
+ let contains_buf = BooleanBuffer::collect_bool(len, |i|
contains(i));
+ let values_buf = validity.inner() & &(!&contains_buf);
+ BooleanArray::new(values_buf, Some(validity.clone()))
+ }
+ // No nulls anywhere: no validity buffer needed
+ (None, false, false) => {
+ let buf = BooleanBuffer::collect_bool(len, |i| contains(i));
+ BooleanArray::new(buf, None)
+ }
+ (None, false, true) => {
+ let buf = BooleanBuffer::collect_bool(len, |i| !contains(i));
+ BooleanArray::new(buf, None)
+ }
+ }
}
-impl Int32StaticFilter {
- fn try_new(in_array: &ArrayRef) -> Result<Self> {
- let in_array = in_array
- .as_primitive_opt::<Int32Type>()
- .ok_or_else(|| exec_datafusion_err!("Failed to downcast array"))?;
+/// Sorted lookup using binary search. Best for small lists (< 8 elements).
+struct SortedLookup<T>(Vec<T>);
- let mut values = HashSet::with_capacity(in_array.len());
- let null_count = in_array.null_count();
+impl<T: Ord> SortedLookup<T> {
+ fn new(mut values: Vec<T>) -> Self {
+ values.sort_unstable();
+ values.dedup();
+ Self(values)
+ }
- for v in in_array.iter().flatten() {
- values.insert(v);
- }
+ #[inline]
+ fn contains(&self, value: &T) -> bool {
+ self.0.binary_search(value).is_ok()
Review Comment:
Yes, you should probably just revert my commit
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]