alamb commented on code in PR #8440:
URL: https://github.com/apache/arrow-datafusion/pull/8440#discussion_r1433076743
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -2484,10 +2630,466 @@ mod tests {
// TODO: add other negative test for other case and op
}
+ #[test]
+ fn prune_with_contained_one_column() {
+ let schema = Arc::new(Schema::new(vec![Field::new("s1",
DataType::Utf8, true)]));
+
+ // Model having information like a bloom filter for s1
+ let statistics = TestStatistics::new()
+ .with_contained(
+ "s1",
+ [ScalarValue::from("foo")],
+ [
+ // container 0 known to only contain "foo"",
+ Some(true),
+ // container 1 known to not contain "foo"
+ Some(false),
+ // container 2 unknown about "foo"
+ None,
+ // container 3 known to only contain "foo"
+ Some(true),
+ // container 4 known to not contain "foo"
+ Some(false),
+ // container 5 unknown about "foo"
+ None,
+ // container 6 known to only contain "foo"
+ Some(true),
+ // container 7 known to not contain "foo"
+ Some(false),
+ // container 8 unknown about "foo"
+ None,
+ ],
+ )
+ .with_contained(
+ "s1",
+ [ScalarValue::from("bar")],
+ [
+ // containers 0,1,2 known to only contain "bar"
+ Some(true),
+ Some(true),
+ Some(true),
+ // container 3,4,5 known to not contain "bar"
+ Some(false),
+ Some(false),
+ Some(false),
+ // container 6,7,8 unknown about "bar"
+ None,
+ None,
+ None,
+ ],
+ )
+ .with_contained(
+ // the way the tests are setup, this data is
+ // consulted if the "foo" and "bar" are being checked at the
same time
+ "s1",
+ [ScalarValue::from("foo"), ScalarValue::from("bar")],
+ [
+ // container 0,1,2 unknown about ("foo, "bar")
+ None,
+ None,
+ None,
+ // container 3,4,5 known to contain only either "foo" and
"bar"
+ Some(true),
+ Some(true),
+ Some(true),
+ // container 6,7,8 known ro contain neither "foo" and
"bar"
+ Some(false),
+ Some(false),
+ Some(false),
+ ],
+ );
+
+ // s1 = 'foo'
+ prune_with_expr(
+ col("s1").eq(lit("foo")),
+ &schema,
+ &statistics,
+ // rule out containers ('false) where we know foo is not present
+ vec![true, false, true, true, false, true, true, false, true],
+ );
+
+ // s1 = 'bar'
+ prune_with_expr(
+ col("s1").eq(lit("bar")),
+ &schema,
+ &statistics,
+ // rule out containers where we know bar is not present
+ vec![true, true, true, false, false, false, true, true, true],
+ );
+
+ // s1 = 'baz' (unknown value)
+ prune_with_expr(
+ col("s1").eq(lit("baz")),
+ &schema,
+ &statistics,
+ // can't rule out anything
+ vec![true, true, true, true, true, true, true, true, true],
+ );
+
+ // s1 = 'foo' AND s1 = 'bar'
+ prune_with_expr(
+ col("s1").eq(lit("foo")).and(col("s1").eq(lit("bar"))),
+ &schema,
+ &statistics,
+ // logically this predicate can't possibly be true (the column
can't
+ // take on both values) but we could rule it out if the stats tell
+ // us that both values are not present
+ vec![true, true, true, true, true, true, true, true, true],
+ );
+
+ // s1 = 'foo' OR s1 = 'bar'
+ prune_with_expr(
+ col("s1").eq(lit("foo")).or(col("s1").eq(lit("bar"))),
+ &schema,
+ &statistics,
+ // can rule out containers that we know contain neither foo nor bar
+ vec![true, true, true, true, true, true, false, false, false],
+ );
+
+ // s1 = 'foo' OR s1 = 'baz'
+ prune_with_expr(
+ col("s1").eq(lit("foo")).or(col("s1").eq(lit("baz"))),
+ &schema,
+ &statistics,
+ // can't rule out anything container
+ vec![true, true, true, true, true, true, true, true, true],
+ );
+
+ // s1 = 'foo' OR s1 = 'bar' OR s1 = 'baz'
+ prune_with_expr(
+ col("s1")
+ .eq(lit("foo"))
+ .or(col("s1").eq(lit("bar")))
+ .or(col("s1").eq(lit("baz"))),
+ &schema,
+ &statistics,
+ // can rule out any containers based on knowledge of s1 and `foo`,
+ // `bar` and (`foo`, `bar`)
+ vec![true, true, true, true, true, true, true, true, true],
+ );
+
+ // s1 != foo
+ prune_with_expr(
+ col("s1").not_eq(lit("foo")),
+ &schema,
+ &statistics,
+ // rule out containers we know for sure only contain foo
+ vec![false, true, true, false, true, true, false, true, true],
+ );
+
+ // s1 != bar
+ prune_with_expr(
+ col("s1").not_eq(lit("bar")),
+ &schema,
+ &statistics,
+ // rule out when we know for sure s1 has the value bar
+ vec![false, false, false, true, true, true, true, true, true],
+ );
+
+ // s1 != foo AND s1 != bar
Review Comment:
At least in this case it has to do with what is known / provided. In this
case, the logic operates on the two conjuncts separately, so it consults what it
knows about `s1` and `foo` and what it knows about `s1` and `bar` separately.
In order to reason about `s1 = 'foo' OR s1 = 'bar'` it needs to use what it
knows about `s1` and `{foo, bar}` rather than about them individually.
However, in this case I think what would make sense (and probably what
actually happens) is that `!(s1 = 'foo' OR s1 = 'bar')` would be simplified to
`s1 != 'foo' AND s1 != 'bar'` at a higher level.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]