alamb commented on code in PR #8654:
URL: https://github.com/apache/arrow-datafusion/pull/8654#discussion_r1437008427
##########
datafusion/physical-expr/src/utils/guarantee.rs:
##########
@@ -645,9 +680,107 @@ mod test {
);
}
- // TODO https://github.com/apache/arrow-datafusion/issues/8436
- // a IN (...)
- // b NOT IN (...)
+ #[test]
+ fn test_single_inlist() {
+ // b IN (1, 2, 3)
+ test_analyze(
+ col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
+ vec![in_guarantee("b", [1, 2, 3])],
+ );
+ // b NOT IN (1, 2, 3)
+ test_analyze(
+ col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
+ vec![not_in_guarantee("b", [1, 2, 3])],
+ );
+ }
+
+ #[test]
+ fn test_inlist_conjunction() {
+ // b IN (1, 2, 3) AND b IN (2, 3, 4)
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
+ vec![in_guarantee("b", [2, 3])],
+ );
+ // b NOT IN (1, 2, 3) AND b IN (2, 3, 4)
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], true)
+ .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
+ vec![
+ not_in_guarantee("b", [1, 2, 3]),
+ in_guarantee("b", [2, 3, 4]),
+ ],
+ );
+ // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4)
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], true)
+ .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)),
+ vec![not_in_guarantee("b", [1, 2, 3, 4])],
+ );
+ // b IN (1, 2, 3) AND b = 4
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").eq(lit(4))),
+ vec![],
+ );
+ // b IN (1, 2, 3) AND b = 2
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").eq(lit(2))),
+ vec![in_guarantee("b", [2])],
+ );
+ // b IN (1, 2, 3) AND b != 2
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").not_eq(lit(2))),
+ vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])],
+ );
+ // b NOT IN (1, 2, 3) AND b != 4
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], true)
+ .and(col("b").not_eq(lit(4))),
+ vec![not_in_guarantee("b", [1, 2, 3, 4])],
+ );
+ // b NOT IN (1, 2, 3) AND b != 2
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], true)
+ .and(col("b").not_eq(lit(2))),
+ vec![not_in_guarantee("b", [1, 2, 3])],
+ );
+ }
+
+ #[test]
+ fn test_inlist_with_disjunction() {
Review Comment:
Could you also please add (negative) tests for the inlist directly used with
`OR` too ?
For example:
```
b IN (1, 2, 3) OR b = 2
b IN (1, 2, 3) OR b != 3
```
I think the code above will work correctly, but it would be nice to verify
##########
datafusion/physical-expr/src/utils/guarantee.rs:
##########
@@ -645,9 +680,107 @@ mod test {
);
}
- // TODO https://github.com/apache/arrow-datafusion/issues/8436
- // a IN (...)
- // b NOT IN (...)
+ #[test]
+ fn test_single_inlist() {
+ // b IN (1, 2, 3)
+ test_analyze(
+ col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
+ vec![in_guarantee("b", [1, 2, 3])],
+ );
+ // b NOT IN (1, 2, 3)
+ test_analyze(
+ col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
+ vec![not_in_guarantee("b", [1, 2, 3])],
+ );
+ }
+
+ #[test]
+ fn test_inlist_conjunction() {
+ // b IN (1, 2, 3) AND b IN (2, 3, 4)
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
+ vec![in_guarantee("b", [2, 3])],
Review Comment:
this is very cool
##########
datafusion/physical-expr/src/utils/guarantee.rs:
##########
@@ -130,6 +124,39 @@ impl LiteralGuarantee {
.fold(GuaranteeBuilder::new(), |builder, expr| {
if let Some(cel) = ColOpLit::try_new(expr) {
return builder.aggregate_conjunct(cel);
+ } else if let Some(inlist) = expr
+ .as_any()
+ .downcast_ref::<crate::expressions::InListExpr>()
+ {
+ //Only support single-column inlist currently,
multi-column inlist is not supported
Review Comment:
This is really nice 👍
##########
datafusion/physical-expr/src/utils/guarantee.rs:
##########
@@ -168,14 +195,21 @@ impl LiteralGuarantee {
// if all terms are 'col <op> literal' with the same column
// and operation we can infer any guarantees
+ //
+ // For those like (a != bar OR a != baz).
+ // We can't combine the (a != bar OR a != baz) part, but
+ // it also doesn't invalidate our knowledge that a !=
+ // foo is required for the expression to be true.
+ // So we can only create a multi guarantee for `=`
+ // (or a single value). (e.g. ignore `a != foo OR a !=
bar`)
let first_term = &terms[0];
if terms.iter().all(|term| {
term.col.name() == first_term.col.name()
- && term.op == first_term.op
+ && term.guarantee == Guarantee::In
Review Comment:
I also double checked that this will verify that `first_term.guarantee`
needs to be `In` as well ✅
##########
datafusion/physical-expr/src/utils/guarantee.rs:
##########
@@ -334,21 +357,33 @@ impl<'a> ColOpLit<'a> {
binary_expr.op(),
binary_expr.right().as_any(),
);
-
+ let guarantee = match op {
Review Comment:
👍
##########
datafusion/physical-expr/src/utils/guarantee.rs:
##########
@@ -130,6 +124,39 @@ impl LiteralGuarantee {
.fold(GuaranteeBuilder::new(), |builder, expr| {
if let Some(cel) = ColOpLit::try_new(expr) {
return builder.aggregate_conjunct(cel);
+ } else if let Some(inlist) = expr
+ .as_any()
+ .downcast_ref::<crate::expressions::InListExpr>()
+ {
+ //Only support single-column inlist currently,
multi-column inlist is not supported
+ let col = inlist
+ .expr()
+ .as_any()
+ .downcast_ref::<crate::expressions::Column>();
+ if col.is_none() {
+ return builder;
+ }
Review Comment:
I think you could avoid the unwraps below by using something like this
```suggestion
let Some(col) = col else {
return builder;
}
```
(and the same for `literals`)
##########
datafusion/physical-expr/src/utils/guarantee.rs:
##########
@@ -257,26 +291,20 @@ impl<'a> GuaranteeBuilder<'a> {
// another `AND a != 6` we know that a must not be either 5 or
6
// for the expression to be true
Guarantee::NotIn => {
- // can extend if only single literal, otherwise invalidate
let new_values: HashSet<_> =
new_values.into_iter().collect();
- if new_values.len() == 1 {
-
existing.literals.extend(new_values.into_iter().cloned())
- } else {
- // this is like (a != foo AND (a != bar OR a != baz)).
- // We can't combine the (a != bar OR a != baz) part,
but
- // it also doesn't invalidate our knowledge that a !=
- // foo is required for the expression to be true
- }
+ existing.literals.extend(new_values.into_iter().cloned());
}
Guarantee::In => {
- // for an IN guarantee, it is ok if the value is the same
- // e.g. `a = foo AND a = foo` but not if the value is
different
- // e.g. `a = foo AND a = bar`
- if new_values
+ let intersection = new_values
.into_iter()
- .all(|new_value| existing.literals.contains(new_value))
- {
- // all values are already in the set
+ .filter(|new_value|
existing.literals.contains(*new_value))
+ .collect::<Vec<_>>();
+ // for an In guarantee, if the intersection is not empty,
we can extend the guarantee
+ // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)`
+ // otherwise, we invalidate the guarantee
+ // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()` ,
which is invalid
Review Comment:
```suggestion
// e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()`,
which is invalid
```
##########
datafusion/physical-expr/src/utils/guarantee.rs:
##########
@@ -645,9 +680,107 @@ mod test {
);
}
- // TODO https://github.com/apache/arrow-datafusion/issues/8436
- // a IN (...)
- // b NOT IN (...)
+ #[test]
+ fn test_single_inlist() {
+ // b IN (1, 2, 3)
+ test_analyze(
+ col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
+ vec![in_guarantee("b", [1, 2, 3])],
+ );
+ // b NOT IN (1, 2, 3)
+ test_analyze(
+ col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
+ vec![not_in_guarantee("b", [1, 2, 3])],
+ );
+ }
+
+ #[test]
+ fn test_inlist_conjunction() {
+ // b IN (1, 2, 3) AND b IN (2, 3, 4)
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
+ vec![in_guarantee("b", [2, 3])],
Review Comment:
This type of simplification might also be valuable to do in the Expr
simplifier code too (so that other optimizer passes could see simpler
predicates).
However, I am not sure how common such predicates are 🤔
##########
datafusion/physical-expr/src/utils/guarantee.rs:
##########
@@ -334,21 +357,33 @@ impl<'a> ColOpLit<'a> {
binary_expr.op(),
binary_expr.right().as_any(),
);
-
+ let guarantee = match op {
+ Operator::Eq => Guarantee::In,
+ Operator::NotEq => Guarantee::NotIn,
+ _ => return None,
+ };
// col <op> literal
if let (Some(col), Some(lit)) = (
left.downcast_ref::<crate::expressions::Column>(),
right.downcast_ref::<crate::expressions::Literal>(),
) {
- Some(Self { col, op: *op, lit })
+ Some(Self {
+ col,
+ guarantee,
+ lit,
+ })
}
// literal <op> col
else if let (Some(lit), Some(col)) = (
left.downcast_ref::<crate::expressions::Literal>(),
right.downcast_ref::<crate::expressions::Column>(),
) {
// Used swapped operator operator, if possible
Review Comment:
this comment looks like it needs to be updated/removed
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -1105,13 +1100,18 @@ mod tests {
let data = bytes::Bytes::from(std::fs::read(path).unwrap());
// generate pruning predicate
- let schema = Schema::new(vec![
- Field::new("String", DataType::Utf8, false),
- Field::new("String3", DataType::Utf8, false),
- ]);
- let sql =
- "SELECT * FROM tbl WHERE \"String\" IN ('Hello_Not_Exists',
'Hello_Not_Exists2')";
- let expr = sql_to_physical_plan(sql).unwrap();
Review Comment:
I agree -- this change makes sense
BTW I tried to improve these tests to avoid duplication in
https://github.com/apache/arrow-datafusion/pull/8435 (not yet merged). I'll try
and consolidate this test too when I get a chance
##########
datafusion/physical-expr/src/utils/guarantee.rs:
##########
@@ -645,9 +680,107 @@ mod test {
);
}
- // TODO https://github.com/apache/arrow-datafusion/issues/8436
- // a IN (...)
- // b NOT IN (...)
+ #[test]
+ fn test_single_inlist() {
+ // b IN (1, 2, 3)
+ test_analyze(
+ col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
+ vec![in_guarantee("b", [1, 2, 3])],
+ );
+ // b NOT IN (1, 2, 3)
+ test_analyze(
+ col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
+ vec![not_in_guarantee("b", [1, 2, 3])],
+ );
+ }
+
+ #[test]
+ fn test_inlist_conjunction() {
+ // b IN (1, 2, 3) AND b IN (2, 3, 4)
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
+ vec![in_guarantee("b", [2, 3])],
+ );
+ // b NOT IN (1, 2, 3) AND b IN (2, 3, 4)
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], true)
+ .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
+ vec![
+ not_in_guarantee("b", [1, 2, 3]),
+ in_guarantee("b", [2, 3, 4]),
+ ],
+ );
+ // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4)
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], true)
+ .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)),
+ vec![not_in_guarantee("b", [1, 2, 3, 4])],
+ );
+ // b IN (1, 2, 3) AND b = 4
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").eq(lit(4))),
+ vec![],
+ );
+ // b IN (1, 2, 3) AND b = 2
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").eq(lit(2))),
+ vec![in_guarantee("b", [2])],
+ );
+ // b IN (1, 2, 3) AND b != 2
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").not_eq(lit(2))),
+ vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])],
+ );
+ // b NOT IN (1, 2, 3) AND b != 4
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], true)
+ .and(col("b").not_eq(lit(4))),
+ vec![not_in_guarantee("b", [1, 2, 3, 4])],
+ );
+ // b NOT IN (1, 2, 3) AND b != 2
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], true)
+ .and(col("b").not_eq(lit(2))),
+ vec![not_in_guarantee("b", [1, 2, 3])],
+ );
+ }
+
+ #[test]
+ fn test_inlist_with_disjunction() {
+ // b IN (1, 2, 3) AND (b = 3 OR b = 4)
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
+ vec![in_guarantee("b", [3])],
+ );
+ // b IN (1, 2, 3) AND (b = 4 OR b = 5)
+ test_analyze(
+ col("b")
+ .in_list(vec![lit(1), lit(2), lit(3)], false)
+ .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))),
+ vec![],
Review Comment:
I theory this could be `in_guarantee(1,2,3)` right? As it can only be true
when b is 1,2,3.
However in this case this expression can never be true so maybe it doesn't
really matter 🤔
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]