tustvold commented on code in PR #4057:
URL: https://github.com/apache/arrow-datafusion/pull/4057#discussion_r1010214548


##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -70,320 +57,185 @@ impl Debug for InListExpr {
             .field("expr", &self.expr)
             .field("list", &self.list)
             .field("negated", &self.negated)
-            .field("set", &self.set)
             .finish()
     }
 }
 
-/// InSet
-#[derive(Debug, PartialEq, Eq)]
-pub struct InSet {
-    // TODO: optimization: In the `IN` or `NOT IN` we don't need to consider 
the NULL value
-    // The data type is same, we can use  set: HashSet<T>
-    set: HashSet<ScalarValue>,
+/// A type-erased container of array elements
+trait Set: Send + Sync {
+    fn contains(&self, v: &dyn Array, negated: bool) -> BooleanArray;
 }
 
-impl InSet {
-    pub fn new(set: HashSet<ScalarValue>) -> Self {
-        Self { set }
-    }
-
-    pub fn get_set(&self) -> &HashSet<ScalarValue> {
-        &self.set
-    }
+struct ArrayHashSet {
+    state: RandomState,
+    /// Used to provide a lookup from value to in list index
+    ///
+    /// Note: usize::hash is not used, instead the raw entry
+    /// API is used to store entries w.r.t their value has
+    map: HashMap<usize, (), ()>,
 }
 
-macro_rules! make_contains {
-    ($ARRAY:expr, $LIST_VALUES:expr, $NEGATED:expr, $SCALAR_VALUE:ident, 
$ARRAY_TYPE:ident) => {{
-        let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
-
-        let contains_null = $LIST_VALUES
-            .iter()
-            .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null()));
-        let values = $LIST_VALUES
-            .iter()
-            .flat_map(|expr| match expr {
-                ColumnarValue::Scalar(s) => match s {
-                    ScalarValue::$SCALAR_VALUE(Some(v)) => Some(*v),
-                    ScalarValue::$SCALAR_VALUE(None) => None,
-                    datatype => unreachable!("InList can't reach other data 
type {} for {}.", datatype, s),
-                },
-                ColumnarValue::Array(_) => {
-                    unimplemented!("InList does not yet support nested 
columns.")
-                }
-            })
-            .collect::<Vec<_>>();
-
-        collection_contains_check!(array, values, $NEGATED, contains_null)
-    }};
+struct ArraySet<T> {
+    array: T,
+    hash_set: Option<ArrayHashSet>,
 }
 
-macro_rules! make_contains_primitive {
-    ($ARRAY:expr, $LIST_VALUES:expr, $NEGATED:expr, $SCALAR_VALUE:ident, 
$ARRAY_TYPE:ident) => {{
-        let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
-
-        let contains_null = $LIST_VALUES
-            .iter()
-            .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null()));
-        let values = $LIST_VALUES
-            .iter()
-            .flat_map(|expr| match expr {
-                ColumnarValue::Scalar(s) => match s {
-                    ScalarValue::$SCALAR_VALUE(Some(v), ..) => Some(*v),
-                    ScalarValue::$SCALAR_VALUE(None, ..) => None,
-                    datatype => unreachable!("InList can't reach other data 
type {} for {}.", datatype, s),
-                },
-                ColumnarValue::Array(_) => {
-                    unimplemented!("InList does not yet support nested 
columns.")
-                }
-            })
-            .collect::<Vec<_>>();
-
-        Ok(collection_contains_check!(array, values, $NEGATED, contains_null))
-    }};
+impl<T> ArraySet<T>
+where
+    T: Array + From<ArrayData>,
+{
+    fn new(array: &T, hash_set: Option<ArrayHashSet>) -> Self {
+        Self {
+            array: T::from(array.data().clone()),
+            hash_set,
+        }
+    }
 }
 
-macro_rules! set_contains_for_float {
-    ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr) => {{
-        let contains_null = $SET_VALUES.iter().any(|s| s.is_null());
-        let bool_array = if $NEGATED {
-            // Not in
-            if contains_null {
-                $ARRAY
-                    .iter()
-                    .map(|vop| {
-                        match vop.map(|v| 
!$SET_VALUES.contains(&v.try_into().unwrap())) {
-                            Some(true) => None,
-                            x => x,
+impl<T> Set for ArraySet<T>
+where
+    T: Array + 'static,
+    for<'a> &'a T: ArrayAccessor,
+    for<'a> <&'a T as ArrayAccessor>::Item: PartialEq + HashValue,

Review Comment:
   This is gross, not really sure how to avoid it though as the lifetime of the 
downcast `& dyn Array` necessarily has a lifetime less than the lifetime of 
ArraySet... So we need to HRTB magic...



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to