alamb commented on code in PR #8440:
URL: https://github.com/apache/arrow-datafusion/pull/8440#discussion_r1431511348
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -993,95 +1102,139 @@ mod tests {
///
/// Note All `ArrayRefs` must be the same size.
struct ContainerStats {
- min: ArrayRef,
- max: ArrayRef,
+ min: Option<ArrayRef>,
Review Comment:
I modified this fixture to support different combinations of
min/max/contained
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -2484,10 +2617,376 @@ mod tests {
// TODO: add other negative test for other case and op
}
+ #[test]
Review Comment:
The new tests start here
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -2484,10 +2630,466 @@ mod tests {
// TODO: add other negative test for other case and op
}
+ #[test]
+ fn prune_with_contained_one_column() {
+ let schema = Arc::new(Schema::new(vec![Field::new("s1",
DataType::Utf8, true)]));
+
+ // Model having information like a bloom filter for s1
+ let statistics = TestStatistics::new()
+ .with_contained(
+ "s1",
+ [ScalarValue::from("foo")],
+ [
+ // container 0 known to only contain "foo"",
+ Some(true),
+ // container 1 known to not contain "foo"
+ Some(false),
+ // container 2 unknown about "foo"
+ None,
+ // container 3 known to only contain "foo"
+ Some(true),
+ // container 4 known to not contain "foo"
+ Some(false),
+ // container 5 unknown about "foo"
+ None,
+ // container 6 known to only contain "foo"
+ Some(true),
+ // container 7 known to not contain "foo"
+ Some(false),
+ // container 8 unknown about "foo"
+ None,
+ ],
+ )
+ .with_contained(
+ "s1",
+ [ScalarValue::from("bar")],
+ [
+ // containers 0,1,2 known to only contain "bar"
+ Some(true),
+ Some(true),
+ Some(true),
+ // container 3,4,5 known to not contain "bar"
+ Some(false),
+ Some(false),
+ Some(false),
+ // container 6,7,8 unknown about "bar"
+ None,
+ None,
+ None,
+ ],
+ )
+ .with_contained(
+ // the way the tests are setup, this data is
+ // consulted if the "foo" and "bar" are being checked at the
same time
+ "s1",
+ [ScalarValue::from("foo"), ScalarValue::from("bar")],
+ [
+ // container 0,1,2 unknown about ("foo, "bar")
+ None,
+ None,
+ None,
+ // container 3,4,5 known to contain only either "foo" and
"bar"
+ Some(true),
+ Some(true),
+ Some(true),
+ // container 6,7,8 known ro contain neither "foo" and
"bar"
+ Some(false),
+ Some(false),
+ Some(false),
+ ],
+ );
+
+ // s1 = 'foo'
+ prune_with_expr(
+ col("s1").eq(lit("foo")),
+ &schema,
+ &statistics,
+ // rule out containers ('false) where we know foo is not present
+ vec![true, false, true, true, false, true, true, false, true],
+ );
+
+ // s1 = 'bar'
+ prune_with_expr(
+ col("s1").eq(lit("bar")),
+ &schema,
+ &statistics,
+ // rule out containers where we know bar is not present
+ vec![true, true, true, false, false, false, true, true, true],
+ );
+
+ // s1 = 'baz' (unknown value)
+ prune_with_expr(
+ col("s1").eq(lit("baz")),
+ &schema,
+ &statistics,
+ // can't rule out anything
+ vec![true, true, true, true, true, true, true, true, true],
+ );
+
+ // s1 = 'foo' AND s1 = 'bar'
+ prune_with_expr(
+ col("s1").eq(lit("foo")).and(col("s1").eq(lit("bar"))),
+ &schema,
+ &statistics,
+ // logically this predicate can't possibly be true (the column
can't
+ // take on both values) but we could rule it out if the stats tell
+ // us that both values are not present
+ vec![true, true, true, true, true, true, true, true, true],
Review Comment:
Yes, you can say that by returning `false` for `contained("s1", {foo, bar})`
However, in this case I think what happens is we end up with two distinct
literal guarantees and the container would have to know that a container only
had foo AND only had bar, which is logically impossible.
So in other words, this expression
```
Pruning with expr: s1 != Utf8("foo") AND s2 != Utf8("bar")
```
Generates these guarantees:
```
Got guarantees: [
LiteralGuarantee { column: Column { relation: None, name: "s1" },
guarantee: NotIn, literals: {Utf8("foo")} },
LiteralGuarantee { column: Column { relation: None, name: "s2" },
guarantee: NotIn, literals: {Utf8("bar")} }
]
```
I think it would be possiible to do another round of analysis on this and
prove this can never be true. I am not sure how important the use case is
however.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]