adriangb commented on code in PR #18832:
URL: https://github.com/apache/datafusion/pull/18832#discussion_r2603405458


##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -198,65 +248,414 @@ impl ArrayStaticFilter {
     }
 }
 
-struct Int32StaticFilter {
-    null_count: usize,
-    values: HashSet<i32>,
+/// Threshold for switching from sorted Vec (binary search) to HashSet
+/// For small lists, binary search has better cache locality and lower overhead
+/// Maximum list size for using sorted lookup (binary search).
+/// Lists with more elements use hash lookup instead.
+const SORTED_LOOKUP_MAX_LEN: usize = 8;
+
+/// Helper to build a BooleanArray result for IN list operations.
+/// Handles SQL three-valued logic for NULL values.
+///
+/// # Arguments
+/// * `len` - Number of elements in the needle array
+/// * `needle_nulls` - Optional validity buffer from the needle array
+/// * `haystack_has_nulls` - Whether the IN list contains NULL values
+/// * `negated` - Whether this is a NOT IN operation
+/// * `contains` - Closure that returns whether needle[i] is found in the 
haystack
+#[inline]
+fn build_in_list_result<C>(
+    len: usize,
+    needle_nulls: Option<&NullBuffer>,
+    haystack_has_nulls: bool,
+    negated: bool,
+    contains: C,
+) -> BooleanArray
+where
+    C: Fn(usize) -> bool,
+{
+    // Use collect_bool for all paths - it's vectorized and faster than 
element-by-element append.
+    // Match on (needle_has_nulls, haystack_has_nulls, negated) to specialize 
each case.
+    match (needle_nulls, haystack_has_nulls, negated) {
+        // Haystack has nulls: result is NULL when not found (might match the 
NULL)
+        // values_buf == nulls_buf, so compute once and clone
+        (Some(validity), true, false) => {
+            let contains_buf = BooleanBuffer::collect_bool(len, |i| 
contains(i));
+            let buf = validity.inner() & &contains_buf;
+            BooleanArray::new(buf.clone(), Some(NullBuffer::new(buf)))
+        }
+        (None, true, false) => {
+            let buf = BooleanBuffer::collect_bool(len, |i| contains(i));
+            BooleanArray::new(buf.clone(), Some(NullBuffer::new(buf)))
+        }
+        (Some(validity), true, true) => {
+            // Compute nulls_buf via SIMD AND, then derive values_buf via XOR.
+            // Uses identity: A & !B = A ^ (A & B) to get values from nulls.
+            let contains_buf = BooleanBuffer::collect_bool(len, |i| 
contains(i));
+            let nulls_buf = validity.inner() & &contains_buf;
+            let values_buf = validity.inner() ^ &nulls_buf; // valid & 
!contains
+            BooleanArray::new(values_buf, Some(NullBuffer::new(nulls_buf)))
+        }
+        (None, true, true) => {
+            // No needle nulls, but haystack has nulls
+            let contains_buf = BooleanBuffer::collect_bool(len, |i| 
contains(i));
+            let values_buf = !&contains_buf;
+            BooleanArray::new(values_buf, Some(NullBuffer::new(contains_buf)))
+        }
+        // Only needle has nulls: nulls_buf is just validity (reuse it 
directly!)
+        (Some(validity), false, false) => {
+            let contains_buf = BooleanBuffer::collect_bool(len, |i| 
contains(i));
+            let values_buf = validity.inner() & &contains_buf;
+            BooleanArray::new(values_buf, Some(validity.clone()))
+        }
+        (Some(validity), false, true) => {
+            let contains_buf = BooleanBuffer::collect_bool(len, |i| 
contains(i));
+            let values_buf = validity.inner() & &(!&contains_buf);
+            BooleanArray::new(values_buf, Some(validity.clone()))
+        }
+        // No nulls anywhere: no validity buffer needed
+        (None, false, false) => {
+            let buf = BooleanBuffer::collect_bool(len, |i| contains(i));
+            BooleanArray::new(buf, None)
+        }
+        (None, false, true) => {
+            let buf = BooleanBuffer::collect_bool(len, |i| !contains(i));
+            BooleanArray::new(buf, None)
+        }
+    }
 }
 
-impl Int32StaticFilter {
-    fn try_new(in_array: &ArrayRef) -> Result<Self> {
-        let in_array = in_array
-            .as_primitive_opt::<Int32Type>()
-            .ok_or_else(|| exec_datafusion_err!("Failed to downcast array"))?;
+/// Sorted lookup using binary search. Best for small lists (< 8 elements).
+struct SortedLookup<T>(Vec<T>);
 
-        let mut values = HashSet::with_capacity(in_array.len());
-        let null_count = in_array.null_count();
+impl<T: Ord> SortedLookup<T> {
+    fn new(mut values: Vec<T>) -> Self {
+        values.sort_unstable();
+        values.dedup();
+        Self(values)
+    }
 
-        for v in in_array.iter().flatten() {
-            values.insert(v);
-        }
+    #[inline]
+    fn contains(&self, value: &T) -> bool {
+        self.0.binary_search(value).is_ok()

Review Comment:
   Yes I think we need to freeze this (and maybe even split it up). Since you 
have some great ideas and can execute them I would love it you could open an 
issue tracking the "plan" and then we can make smaller targeted PRs off of 
that. I personally have *a lot* of other DataFusion work on my plate so I'd 
love to hand off a bit of this to those with better ideas and more interest and 
I can then help review PRs, etc?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to