alamb commented on code in PR #2809:
URL: https://github.com/apache/arrow-datafusion/pull/2809#discussion_r912340297
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1139,4 +1242,192 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn in_list_set_bool() -> Result<()> {
+ let schema = Schema::new(vec![Field::new("a", DataType::Boolean,
true)]);
+ let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
+ let col_a = col("a", &schema)?;
+ let batch = RecordBatch::try_new(Arc::new(schema.clone()),
vec![Arc::new(a)])?;
+
+ // expression: "a in (true,null,true.....)"
+ let mut list = vec![
+ lit(ScalarValue::Boolean(Some(true))),
+ lit(ScalarValue::Boolean(None)),
+ ];
+ for _ in 1..(OPTIMIZER_INSET_THRESHOLD + 1) {
Review Comment:
I don't understand the choice of bounds of `1` ...
`OPTIMIZER_INSET_THRESHOLD + 1` -- why not `0..OPTIMIZER_INSET_THRESHOLD`? Not
that this way is wrong, I just don't understand it
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1139,4 +1242,192 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn in_list_set_bool() -> Result<()> {
+ let schema = Schema::new(vec![Field::new("a", DataType::Boolean,
true)]);
+ let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
+ let col_a = col("a", &schema)?;
+ let batch = RecordBatch::try_new(Arc::new(schema.clone()),
vec![Arc::new(a)])?;
+
+ // expression: "a in (true,null,true.....)"
+ let mut list = vec![
+ lit(ScalarValue::Boolean(Some(true))),
+ lit(ScalarValue::Boolean(None)),
+ ];
+ for _ in 1..(OPTIMIZER_INSET_THRESHOLD + 1) {
+ list.push(lit(ScalarValue::Boolean(Some(true))));
+ }
+ in_list!(
+ batch,
+ list.clone(),
+ &false,
+ vec![Some(true), None, None],
+ col_a.clone(),
+ &schema
+ );
+ in_list!(
+ batch,
+ list,
+ &true,
+ vec![Some(false), None, None],
+ col_a.clone(),
+ &schema
+ );
+ Ok(())
+ }
+
+ #[test]
+ fn in_list_set_int64() -> Result<()> {
+ let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
+ let a = Int64Array::from(vec![Some(0), Some(2), None]);
+ let col_a = col("a", &schema)?;
+ let batch = RecordBatch::try_new(Arc::new(schema.clone()),
vec![Arc::new(a)])?;
+
+ // expression: "a in (0,3,4....)"
+ let mut list = vec![
+ lit(ScalarValue::Int64(Some(0))),
+ lit(ScalarValue::Int64(None)),
+ lit(ScalarValue::Int64(Some(3))),
+ ];
+ for v in 4..(OPTIMIZER_INSET_THRESHOLD + 4) {
+ list.push(lit(ScalarValue::Int64(Some(v as i64))));
+ }
+
+ in_list!(
+ batch,
+ list.clone(),
+ &false,
+ vec![Some(true), None, None],
+ col_a.clone(),
+ &schema
+ );
+
+ in_list!(
+ batch,
+ list.clone(),
+ &true,
+ vec![Some(false), None, None],
+ col_a.clone(),
+ &schema
+ );
+
+ Ok(())
+ }
+
+ #[test]
+ fn in_list_set_float64() -> Result<()> {
+ let schema = Schema::new(vec![Field::new("a", DataType::Float64,
true)]);
+ let a = Float64Array::from(vec![Some(0.0), Some(2.0), None]);
+ let col_a = col("a", &schema)?;
+ let batch = RecordBatch::try_new(Arc::new(schema.clone()),
vec![Arc::new(a)])?;
+
+ // expression: "a in (0.0,3.0,4.0 ....)"
Review Comment:
```suggestion
// expression: "a in (0.0,Null,3.0,4.0 ....)"
```
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -163,16 +139,14 @@ macro_rules! make_contains_primitive {
ColumnarValue::Scalar(s) => match s {
ScalarValue::$SCALAR_VALUE(Some(v)) => Some(*v),
ScalarValue::$SCALAR_VALUE(None) => None,
- // TODO this is bug, for primitive the expr list should be
cast to the same data type
- ScalarValue::Utf8(None) => None,
- datatype => unimplemented!("Unexpected type {} for
InList", datatype),
+ datatype => unreachable!("InList can't reach other data
type {} for {}.", datatype, s),
},
ColumnarValue::Array(_) => {
unimplemented!("InList does not yet support nested
columns.")
}
})
.collect::<Vec<_>>();
-
+ // TODO do we need to replace this below logic by `in_list_primitive`?
Review Comment:
I reviewed the logic below and it looked correct to me -- are you asking
about the trying to refactor to reduce duplication?
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1139,4 +1242,192 @@ mod tests {
Ok(())
}
+
Review Comment:
I really like the test coverage
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1139,4 +1242,192 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn in_list_set_bool() -> Result<()> {
+ let schema = Schema::new(vec![Field::new("a", DataType::Boolean,
true)]);
+ let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
+ let col_a = col("a", &schema)?;
+ let batch = RecordBatch::try_new(Arc::new(schema.clone()),
vec![Arc::new(a)])?;
+
+ // expression: "a in (true,null,true.....)"
+ let mut list = vec![
+ lit(ScalarValue::Boolean(Some(true))),
+ lit(ScalarValue::Boolean(None)),
+ ];
+ for _ in 1..(OPTIMIZER_INSET_THRESHOLD + 1) {
+ list.push(lit(ScalarValue::Boolean(Some(true))));
+ }
+ in_list!(
+ batch,
+ list.clone(),
+ &false,
+ vec![Some(true), None, None],
+ col_a.clone(),
+ &schema
+ );
+ in_list!(
+ batch,
+ list,
+ &true,
+ vec![Some(false), None, None],
+ col_a.clone(),
+ &schema
+ );
+ Ok(())
+ }
+
+ #[test]
+ fn in_list_set_int64() -> Result<()> {
+ let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
+ let a = Int64Array::from(vec![Some(0), Some(2), None]);
+ let col_a = col("a", &schema)?;
+ let batch = RecordBatch::try_new(Arc::new(schema.clone()),
vec![Arc::new(a)])?;
+
+ // expression: "a in (0,3,4....)"
Review Comment:
```suggestion
// expression: "a in (0,Null,3,4....)"
```
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -209,26 +183,109 @@ macro_rules! make_contains_primitive {
}};
}
-macro_rules! set_contains_with_negated {
- ($ARRAY:expr, $LIST_VALUES:expr, $NEGATED:expr) => {{
- if $NEGATED {
- return Ok(ColumnarValue::Array(Arc::new(
+macro_rules! set_contains_for_float {
+ ($ARRAY:expr, $SET_VALUES:expr, $SCALAR_VALUE:ident, $NEGATED:expr,
$PHY_TYPE:ty) => {{
+ let contains_null = $SET_VALUES.iter().any(|s| s.is_null());
+ let bool_array = if $NEGATED {
+ // Not in
+ if contains_null {
$ARRAY
.iter()
- .map(|x| x.map(|v|
!$LIST_VALUES.contains(&v.try_into().unwrap())))
- .collect::<BooleanArray>(),
- )));
+ .map(|vop| {
+ match vop.map(|v|
!$SET_VALUES.contains(&v.try_into().unwrap())) {
Review Comment:
Elsewhere in DataFusion (including in `ScalarValue`) we use `ordered_float`
to compare floating point numbers
It might be possible to use `set::<OrderedFloat<f32>>`, which would be more
space efficient (fewer bytes than `ScalarValue`) as well as faster (as the
comparison doens't have to dispatch on the type each time)
https://github.com/apache/arrow-datafusion/blob/88b88d4360054a85982987aa07b3f3afd2db7d70/datafusion/common/src/scalar.rs#L33
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1139,4 +1242,192 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn in_list_set_bool() -> Result<()> {
+ let schema = Schema::new(vec![Field::new("a", DataType::Boolean,
true)]);
+ let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
+ let col_a = col("a", &schema)?;
+ let batch = RecordBatch::try_new(Arc::new(schema.clone()),
vec![Arc::new(a)])?;
+
+ // expression: "a in (true,null,true.....)"
+ let mut list = vec![
+ lit(ScalarValue::Boolean(Some(true))),
+ lit(ScalarValue::Boolean(None)),
+ ];
+ for _ in 1..(OPTIMIZER_INSET_THRESHOLD + 1) {
+ list.push(lit(ScalarValue::Boolean(Some(true))));
+ }
+ in_list!(
+ batch,
+ list.clone(),
+ &false,
+ vec![Some(true), None, None],
Review Comment:
👍
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]