alamb commented on code in PR #8440:
URL: https://github.com/apache/arrow-datafusion/pull/8440#discussion_r1431511348


##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -993,95 +1102,139 @@ mod tests {
     ///
     /// Note All `ArrayRefs` must be the same size.
     struct ContainerStats {
-        min: ArrayRef,
-        max: ArrayRef,
+        min: Option<ArrayRef>,

Review Comment:
   I modified this fixture to support different combinations of 
min/max/contained



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -2484,10 +2617,376 @@ mod tests {
         // TODO: add other negative test for other case and op
     }
 
+    #[test]

Review Comment:
   The new tests start here



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -2484,10 +2630,466 @@ mod tests {
         // TODO: add other negative test for other case and op
     }
 
+    #[test]
+    fn prune_with_contained_one_column() {
+        let schema = Arc::new(Schema::new(vec![Field::new("s1", 
DataType::Utf8, true)]));
+
+        // Model having information like a bloom filter for s1
+        let statistics = TestStatistics::new()
+            .with_contained(
+                "s1",
+                [ScalarValue::from("foo")],
+                [
+                    // container 0 known to only contain "foo"",
+                    Some(true),
+                    // container 1 known to not contain "foo"
+                    Some(false),
+                    // container 2 unknown about "foo"
+                    None,
+                    // container 3 known to only contain "foo"
+                    Some(true),
+                    // container 4 known to not contain "foo"
+                    Some(false),
+                    // container 5 unknown about "foo"
+                    None,
+                    // container 6 known to only contain "foo"
+                    Some(true),
+                    // container 7 known to not contain "foo"
+                    Some(false),
+                    // container 8 unknown about "foo"
+                    None,
+                ],
+            )
+            .with_contained(
+                "s1",
+                [ScalarValue::from("bar")],
+                [
+                    // containers 0,1,2 known to only contain "bar"
+                    Some(true),
+                    Some(true),
+                    Some(true),
+                    // container 3,4,5 known to not contain "bar"
+                    Some(false),
+                    Some(false),
+                    Some(false),
+                    // container 6,7,8 unknown about "bar"
+                    None,
+                    None,
+                    None,
+                ],
+            )
+            .with_contained(
+                // the way the tests are setup, this data is
+                // consulted if the "foo" and "bar" are being checked at the 
same time
+                "s1",
+                [ScalarValue::from("foo"), ScalarValue::from("bar")],
+                [
+                    // container 0,1,2 unknown about ("foo, "bar")
+                    None,
+                    None,
+                    None,
+                    // container 3,4,5 known to contain only either "foo" and 
"bar"
+                    Some(true),
+                    Some(true),
+                    Some(true),
+                    // container 6,7,8  known ro contain  neither "foo" and 
"bar"
+                    Some(false),
+                    Some(false),
+                    Some(false),
+                ],
+            );
+
+        // s1 = 'foo'
+        prune_with_expr(
+            col("s1").eq(lit("foo")),
+            &schema,
+            &statistics,
+            // rule out containers ('false) where we know foo is not present
+            vec![true, false, true, true, false, true, true, false, true],
+        );
+
+        // s1 = 'bar'
+        prune_with_expr(
+            col("s1").eq(lit("bar")),
+            &schema,
+            &statistics,
+            // rule out containers where we know bar is not present
+            vec![true, true, true, false, false, false, true, true, true],
+        );
+
+        // s1 = 'baz' (unknown value)
+        prune_with_expr(
+            col("s1").eq(lit("baz")),
+            &schema,
+            &statistics,
+            // can't rule out anything
+            vec![true, true, true, true, true, true, true, true, true],
+        );
+
+        // s1 = 'foo' AND s1 = 'bar'
+        prune_with_expr(
+            col("s1").eq(lit("foo")).and(col("s1").eq(lit("bar"))),
+            &schema,
+            &statistics,
+            // logically this predicate can't possibly be true (the column 
can't
+            // take on both values) but we could rule it out if the stats tell
+            // us that both values are not present
+            vec![true, true, true, true, true, true, true, true, true],

Review Comment:
   Yes, you can say that by returning `false` for `contained("s1", {foo, bar})`
   
   However, in this case I think what happens is we end up with two distinct 
literal guarantees and the container would have to know that a container only 
had foo AND only had bar, which is logically impossible. 
   
   So in other words, this expression
   ```
   Pruning with expr: s1 != Utf8("foo") AND s2 != Utf8("bar")
   ```
   
   Generates these guarantees:
   ```
      Got guarantees: [
       LiteralGuarantee { column: Column { relation: None, name: "s1" }, 
guarantee: NotIn, literals: {Utf8("foo")} },
       LiteralGuarantee { column: Column { relation: None, name: "s2" }, 
guarantee: NotIn, literals: {Utf8("bar")} }
     ]
   ```
   
   I think it would be possiible to do another round of analysis on this and 
prove this can never be true. I am not sure how important the use case is 
however.
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to