alamb commented on code in PR #4057:
URL: https://github.com/apache/arrow-datafusion/pull/4057#discussion_r1012115447


##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -70,320 +51,172 @@ impl Debug for InListExpr {
             .field("expr", &self.expr)
             .field("list", &self.list)
             .field("negated", &self.negated)
-            .field("set", &self.set)
             .finish()
     }
 }
 
-/// InSet
-#[derive(Debug, PartialEq, Eq)]
-pub struct InSet {
-    // TODO: optimization: In the `IN` or `NOT IN` we don't need to consider 
the NULL value
-    // The data type is same, we can use  set: HashSet<T>
-    set: HashSet<ScalarValue>,
+/// A type-erased container of array elements
+trait Set: Send + Sync {
+    fn contains(&self, v: &dyn Array, negated: bool) -> BooleanArray;
 }
 
-impl InSet {
-    pub fn new(set: HashSet<ScalarValue>) -> Self {
-        Self { set }
-    }
-
-    pub fn get_set(&self) -> &HashSet<ScalarValue> {
-        &self.set
-    }
+struct ArrayHashSet {
+    state: RandomState,
+    /// Used to provide a lookup from value to in list index
+    ///
+    /// Note: usize::hash is not used, instead the raw entry
+    /// API is used to store entries w.r.t their value has
+    map: HashMap<usize, (), ()>,
 }
 
-macro_rules! make_contains {
-    ($ARRAY:expr, $LIST_VALUES:expr, $NEGATED:expr, $SCALAR_VALUE:ident, 
$ARRAY_TYPE:ident) => {{
-        let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
-
-        let contains_null = $LIST_VALUES
-            .iter()
-            .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null()));
-        let values = $LIST_VALUES
-            .iter()
-            .flat_map(|expr| match expr {
-                ColumnarValue::Scalar(s) => match s {
-                    ScalarValue::$SCALAR_VALUE(Some(v)) => Some(*v),
-                    ScalarValue::$SCALAR_VALUE(None) => None,
-                    datatype => unreachable!("InList can't reach other data 
type {} for {}.", datatype, s),
-                },
-                ColumnarValue::Array(_) => {
-                    unimplemented!("InList does not yet support nested 
columns.")
-                }
-            })
-            .collect::<Vec<_>>();
-
-        collection_contains_check!(array, values, $NEGATED, contains_null)
-    }};
+struct ArraySet<T> {
+    array: T,
+    hash_set: ArrayHashSet,
 }
 
-macro_rules! make_contains_primitive {
-    ($ARRAY:expr, $LIST_VALUES:expr, $NEGATED:expr, $SCALAR_VALUE:ident, 
$ARRAY_TYPE:ident) => {{
-        let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
-
-        let contains_null = $LIST_VALUES
-            .iter()
-            .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null()));
-        let values = $LIST_VALUES
-            .iter()
-            .flat_map(|expr| match expr {
-                ColumnarValue::Scalar(s) => match s {
-                    ScalarValue::$SCALAR_VALUE(Some(v), ..) => Some(*v),
-                    ScalarValue::$SCALAR_VALUE(None, ..) => None,
-                    datatype => unreachable!("InList can't reach other data 
type {} for {}.", datatype, s),
-                },
-                ColumnarValue::Array(_) => {
-                    unimplemented!("InList does not yet support nested 
columns.")
-                }
-            })
-            .collect::<Vec<_>>();
-
-        Ok(collection_contains_check!(array, values, $NEGATED, contains_null))
-    }};
-}
-
-macro_rules! set_contains_for_float {
-    ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr) => {{
-        let contains_null = $SET_VALUES.iter().any(|s| s.is_null());
-        let bool_array = if $NEGATED {
-            // Not in
-            if contains_null {
-                $ARRAY
-                    .iter()
-                    .map(|vop| {
-                        match vop.map(|v| 
!$SET_VALUES.contains(&v.try_into().unwrap())) {
-                            Some(true) => None,
-                            x => x,
-                        }
-                    })
-                    .collect::<BooleanArray>()
-            } else {
-                $ARRAY
-                    .iter()
-                    .map(|vop| vop.map(|v| 
!$SET_VALUES.contains(&v.try_into().unwrap())))
-                    .collect::<BooleanArray>()
-            }
-        } else {
-            // In
-            if contains_null {
-                $ARRAY
-                    .iter()
-                    .map(|vop| {
-                        match vop.map(|v| 
$SET_VALUES.contains(&v.try_into().unwrap())) {
-                            Some(false) => None,
-                            x => x,
-                        }
-                    })
-                    .collect::<BooleanArray>()
-            } else {
-                $ARRAY
-                    .iter()
-                    .map(|vop| vop.map(|v| 
$SET_VALUES.contains(&v.try_into().unwrap())))
-                    .collect::<BooleanArray>()
-            }
-        };
-        ColumnarValue::Array(Arc::new(bool_array))
-    }};
+impl<T> ArraySet<T>
+where
+    T: Array + From<ArrayData>,
+{
+    fn new(array: &T, hash_set: ArrayHashSet) -> Self {
+        Self {
+            array: T::from(array.data().clone()),
+            hash_set,
+        }
+    }
 }
 
-macro_rules! set_contains_for_primitive {
-    ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr) => {{
-        let contains_null = $SET_VALUES.iter().any(|s| s.is_null());
-        let native_set = $SET_VALUES
-            .iter()
-            .flat_map(|v| match v {
-                $SCALAR_VALUE(value, ..) => *value,
-                datatype => {
-                    unreachable!(
-                        "InList can't reach other data type {} for {}.",
-                        datatype, v
-                    )
-                }
+impl<T> Set for ArraySet<T>
+where
+    T: Array + 'static,
+    for<'a> &'a T: ArrayAccessor,
+    for<'a> <&'a T as ArrayAccessor>::Item: PartialEq + HashValue,
+{
+    fn contains(&self, v: &dyn Array, negated: bool) -> BooleanArray {
+        let v = v.as_any().downcast_ref::<T>().unwrap();
+        let in_data = self.array.data();
+        let in_array = &self.array;
+        let has_nulls = in_data.null_count() != 0;
+
+        ArrayIter::new(v)

Review Comment:
   I wonder if it would be more performant to use `hash_array` to has the array 
all in one go rather than doing it an element at a time? 
   
   I was thinking that since this is the hot path the more vectorization we get 
the better



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -70,320 +51,172 @@ impl Debug for InListExpr {
             .field("expr", &self.expr)
             .field("list", &self.list)
             .field("negated", &self.negated)
-            .field("set", &self.set)
             .finish()
     }
 }
 
-/// InSet
-#[derive(Debug, PartialEq, Eq)]
-pub struct InSet {
-    // TODO: optimization: In the `IN` or `NOT IN` we don't need to consider 
the NULL value
-    // The data type is same, we can use  set: HashSet<T>
-    set: HashSet<ScalarValue>,
+/// A type-erased container of array elements
+trait Set: Send + Sync {
+    fn contains(&self, v: &dyn Array, negated: bool) -> BooleanArray;
 }
 
-impl InSet {
-    pub fn new(set: HashSet<ScalarValue>) -> Self {
-        Self { set }
-    }
-
-    pub fn get_set(&self) -> &HashSet<ScalarValue> {
-        &self.set
-    }
+struct ArrayHashSet {
+    state: RandomState,
+    /// Used to provide a lookup from value to in list index
+    ///
+    /// Note: usize::hash is not used, instead the raw entry
+    /// API is used to store entries w.r.t their value has

Review Comment:
   ```suggestion
       /// API is used to store entries w.r.t their value
   ```



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -70,320 +51,172 @@ impl Debug for InListExpr {
             .field("expr", &self.expr)
             .field("list", &self.list)
             .field("negated", &self.negated)
-            .field("set", &self.set)
             .finish()
     }
 }
 
-/// InSet
-#[derive(Debug, PartialEq, Eq)]
-pub struct InSet {
-    // TODO: optimization: In the `IN` or `NOT IN` we don't need to consider 
the NULL value
-    // The data type is same, we can use  set: HashSet<T>
-    set: HashSet<ScalarValue>,
+/// A type-erased container of array elements
+trait Set: Send + Sync {
+    fn contains(&self, v: &dyn Array, negated: bool) -> BooleanArray;
 }
 
-impl InSet {
-    pub fn new(set: HashSet<ScalarValue>) -> Self {
-        Self { set }
-    }
-
-    pub fn get_set(&self) -> &HashSet<ScalarValue> {
-        &self.set
-    }
+struct ArrayHashSet {
+    state: RandomState,
+    /// Used to provide a lookup from value to in list index
+    ///
+    /// Note: usize::hash is not used, instead the raw entry
+    /// API is used to store entries w.r.t their value has

Review Comment:
   This pattern is used over and over again (in arrow as well) -- basically it 
is a HashSet that doesn't actually store the set members I think
   



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -945,7 +323,6 @@ impl PartialEq<dyn Any> for InListExpr {
                 self.expr.eq(&x.expr)
                     && expr_list_eq_any_order(&self.list, &x.list)
                     && self.negated == x.negated

Review Comment:
   ```suggestion
                       && self.negated == x.negated
                       // since self.static_filter is derived from self.list, 
not necessary to check here
   ```



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1809,58 +976,6 @@ mod tests {
         Ok(())
     }
 
-    #[test]
-    fn in_list_set_timestamp() -> Result<()> {

Review Comment:
   FWIW I verified there is still coverage of timestamp testing above



##########
datafusion/physical-expr/src/hash_utils.rs:
##########
@@ -68,15 +68,15 @@ hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, 
u64);
 hash_value!(bool, str, [u8]);
 
 macro_rules! hash_float_value {
-    ($($t:ty),+) => {
+    ($(($t:ty, $i:ty)),+) => {
         $(impl HashValue for $t {
             fn hash_one(&self, state: &RandomState) -> u64 {
-                state.hash_one(self.to_le_bytes())
+                state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))

Review Comment:
   I know it is a pain, but i would recommend pulling this into its own PR



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to