alamb commented on code in PR #2809:
URL: https://github.com/apache/arrow-datafusion/pull/2809#discussion_r912340297


##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1139,4 +1242,192 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn in_list_set_bool() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Boolean, 
true)]);
+        let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), 
vec![Arc::new(a)])?;
+
+        // expression: "a in (true,null,true.....)"
+        let mut list = vec![
+            lit(ScalarValue::Boolean(Some(true))),
+            lit(ScalarValue::Boolean(None)),
+        ];
+        for _ in 1..(OPTIMIZER_INSET_THRESHOLD + 1) {

Review Comment:
   I don't understand the choice of bounds of `1` ... 
`OPTIMIZER_INSET_THRESHOLD + 1` -- why not `0..OPTIMIZER_INSET_THRESHOLD`? Not 
that this way is wrong, I just don't understand it



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1139,4 +1242,192 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn in_list_set_bool() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Boolean, 
true)]);
+        let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), 
vec![Arc::new(a)])?;
+
+        // expression: "a in (true,null,true.....)"
+        let mut list = vec![
+            lit(ScalarValue::Boolean(Some(true))),
+            lit(ScalarValue::Boolean(None)),
+        ];
+        for _ in 1..(OPTIMIZER_INSET_THRESHOLD + 1) {
+            list.push(lit(ScalarValue::Boolean(Some(true))));
+        }
+        in_list!(
+            batch,
+            list.clone(),
+            &false,
+            vec![Some(true), None, None],
+            col_a.clone(),
+            &schema
+        );
+        in_list!(
+            batch,
+            list,
+            &true,
+            vec![Some(false), None, None],
+            col_a.clone(),
+            &schema
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn in_list_set_int64() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
+        let a = Int64Array::from(vec![Some(0), Some(2), None]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), 
vec![Arc::new(a)])?;
+
+        // expression: "a in (0,3,4....)"
+        let mut list = vec![
+            lit(ScalarValue::Int64(Some(0))),
+            lit(ScalarValue::Int64(None)),
+            lit(ScalarValue::Int64(Some(3))),
+        ];
+        for v in 4..(OPTIMIZER_INSET_THRESHOLD + 4) {
+            list.push(lit(ScalarValue::Int64(Some(v as i64))));
+        }
+
+        in_list!(
+            batch,
+            list.clone(),
+            &false,
+            vec![Some(true), None, None],
+            col_a.clone(),
+            &schema
+        );
+
+        in_list!(
+            batch,
+            list.clone(),
+            &true,
+            vec![Some(false), None, None],
+            col_a.clone(),
+            &schema
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn in_list_set_float64() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Float64, 
true)]);
+        let a = Float64Array::from(vec![Some(0.0), Some(2.0), None]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), 
vec![Arc::new(a)])?;
+
+        // expression: "a in (0.0,3.0,4.0 ....)"

Review Comment:
   ```suggestion
           // expression: "a in (0.0,Null,3.0,4.0 ....)"
   ```



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -163,16 +139,14 @@ macro_rules! make_contains_primitive {
                 ColumnarValue::Scalar(s) => match s {
                     ScalarValue::$SCALAR_VALUE(Some(v)) => Some(*v),
                     ScalarValue::$SCALAR_VALUE(None) => None,
-                    // TODO this is bug, for primitive the expr list should be 
cast to the same data type
-                    ScalarValue::Utf8(None) => None,
-                    datatype => unimplemented!("Unexpected type {} for 
InList", datatype),
+                    datatype => unreachable!("InList can't reach other data 
type {} for {}.", datatype, s),
                 },
                 ColumnarValue::Array(_) => {
                     unimplemented!("InList does not yet support nested 
columns.")
                 }
             })
             .collect::<Vec<_>>();
-
+        // TODO do we need to replace this below logic by `in_list_primitive`?

Review Comment:
   I reviewed the logic below and it looked correct to me -- are you asking 
about the trying to refactor to reduce duplication?



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1139,4 +1242,192 @@ mod tests {
 
         Ok(())
     }
+

Review Comment:
   I really like the test coverage



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1139,4 +1242,192 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn in_list_set_bool() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Boolean, 
true)]);
+        let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), 
vec![Arc::new(a)])?;
+
+        // expression: "a in (true,null,true.....)"
+        let mut list = vec![
+            lit(ScalarValue::Boolean(Some(true))),
+            lit(ScalarValue::Boolean(None)),
+        ];
+        for _ in 1..(OPTIMIZER_INSET_THRESHOLD + 1) {
+            list.push(lit(ScalarValue::Boolean(Some(true))));
+        }
+        in_list!(
+            batch,
+            list.clone(),
+            &false,
+            vec![Some(true), None, None],
+            col_a.clone(),
+            &schema
+        );
+        in_list!(
+            batch,
+            list,
+            &true,
+            vec![Some(false), None, None],
+            col_a.clone(),
+            &schema
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn in_list_set_int64() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
+        let a = Int64Array::from(vec![Some(0), Some(2), None]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), 
vec![Arc::new(a)])?;
+
+        // expression: "a in (0,3,4....)"

Review Comment:
   ```suggestion
           // expression: "a in (0,Null,3,4....)"
   ```



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -209,26 +183,109 @@ macro_rules! make_contains_primitive {
     }};
 }
 
-macro_rules! set_contains_with_negated {
-    ($ARRAY:expr, $LIST_VALUES:expr, $NEGATED:expr) => {{
-        if $NEGATED {
-            return Ok(ColumnarValue::Array(Arc::new(
+macro_rules! set_contains_for_float {
+    ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr, 
$PHY_TYPE:ty) => {{
+        let contains_null = $SET_VALUES.iter().any(|s| s.is_null());
+        let bool_array = if $NEGATED {
+            // Not in
+            if contains_null {
                 $ARRAY
                     .iter()
-                    .map(|x| x.map(|v| 
!$LIST_VALUES.contains(&v.try_into().unwrap())))
-                    .collect::<BooleanArray>(),
-            )));
+                    .map(|vop| {
+                        match vop.map(|v| 
!$SET_VALUES.contains(&v.try_into().unwrap())) {

Review Comment:
   Elsewhere in DataFusion (including in `ScalarValue`) we use `ordered_float` 
to compare floating point numbers
   
   It might be possible  to use `set::<OrderedFloat<f32>>`, which would be more 
space efficient (fewer bytes than `ScalarValue`) as well as faster (as the 
comparison doens't have to dispatch on the type each time)
   
   
   
https://github.com/apache/arrow-datafusion/blob/88b88d4360054a85982987aa07b3f3afd2db7d70/datafusion/common/src/scalar.rs#L33



##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1139,4 +1242,192 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn in_list_set_bool() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Boolean, 
true)]);
+        let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), 
vec![Arc::new(a)])?;
+
+        // expression: "a in (true,null,true.....)"
+        let mut list = vec![
+            lit(ScalarValue::Boolean(Some(true))),
+            lit(ScalarValue::Boolean(None)),
+        ];
+        for _ in 1..(OPTIMIZER_INSET_THRESHOLD + 1) {
+            list.push(lit(ScalarValue::Boolean(Some(true))));
+        }
+        in_list!(
+            batch,
+            list.clone(),
+            &false,
+            vec![Some(true), None, None],

Review Comment:
   👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to