waynexia commented on code in PR #8440:
URL: https://github.com/apache/arrow-datafusion/pull/8440#discussion_r1435120353


##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -254,9 +295,91 @@ impl PruningPredicate {
         is_always_true(&self.predicate_expr)
     }
 
-    pub(crate) fn required_columns(&self) -> &RequiredStatColumns {
+    pub(crate) fn required_columns(&self) -> &RequiredColumns {
         &self.required_columns
     }
+
+    /// Names of the columns that are known to be / not be in a set
+    /// of literals (constants). These are the columns the that may be passed 
to
+    /// [`PruningStatistics::contained`] during pruning.
+    ///
+    /// This is useful to avoid fetching statistics for columns that will not 
be
+    /// used in the predicate. For example, it can be used to avoid reading
+    /// uneeded bloom filters (a non trivial operation).
+    pub fn literal_columns(&self) -> Vec<String> {
+        let mut seen = HashSet::new();
+        self.literal_guarantees
+            .iter()
+            .map(|e| &e.column.name)
+            // avoid duplicates
+            .filter(|name| seen.insert(*name))
+            .map(|s| s.to_string())
+            .collect()
+    }
+}
+
+/// Builds the return `Vec` for [`PruningPredicate::prune`].
+#[derive(Debug)]
+struct BoolVecBuilder {
+    /// One element per container. Each element is
+    /// * `true`: if the container has row that may pass the predicate
+    /// * `false`: if the container has rows that DEFINITELY DO NOT pass the 
predicate
+    inner: Vec<bool>,
+}
+
+impl BoolVecBuilder {
+    /// Create a new `BoolVecBuilder` with `num_containers` elements
+    fn new(num_containers: usize) -> Self {
+        Self {
+            // assume by default all containers may pass the predicate
+            inner: vec![true; num_containers],
+        }
+    }
+
+    /// Combines result `array` for a  conjunct (e.g. `AND` clause) of a
+    /// predicate into the currently in progress array.
+    ///
+    /// Each `array` element is:
+    /// * `true`: container has row that may pass the predicate
+    /// * `false`: all container rows DEFINITELY DO NOT pass the predicate
+    /// * `null`: container may or may not have rows that pass the predicate
+    fn append_array(&mut self, array: &BooleanArray) {
+        assert_eq!(array.len(), self.inner.len());
+        for (cur, new) in self.inner.iter_mut().zip(array.iter()) {
+            // `false` for this conjunct means we know for sure no rows could
+            // pass the predicate and thus we set the corresponding container
+            // location to false.
+            if let Some(false) = new {
+                *cur = false;
+            }
+        }
+    }
+
+    /// Combines the results in the [`ColumnarValue`] to the currently in
+    /// progress array, following the same rules as [`Self::append_array`].
+    ///
+    /// # Panics
+    /// If `value` is not boolean
+    fn append_value(&mut self, value: ColumnarValue) {

Review Comment:
   A random thought about the naming: "append" sometimes implies "push and 
extend", but from the implementation, this method looks closer to "and"(&) the 
given boolean array with the existing one.



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -142,12 +167,17 @@ pub trait PruningStatistics {
 pub struct PruningPredicate {
     /// The input schema against which the predicate will be evaluated
     schema: SchemaRef,
-    /// Actual pruning predicate (rewritten in terms of column min/max 
statistics)
+    /// A min/max pruning predicate (rewritten in terms of column min/max
+    /// values, which are supplied by statistics)
     predicate_expr: Arc<dyn PhysicalExpr>,
-    /// The statistics required to evaluate this predicate
-    required_columns: RequiredStatColumns,
-    /// Original physical predicate from which this predicate expr is derived 
(required for serialization)
+    /// Description of which statistics are required to evaluate 
`predicate_expr`
+    required_columns: RequiredColumns,
+    /// Original physical predicate from which this predicate expr is derived
+    /// (required for serialization)
     orig_expr: Arc<dyn PhysicalExpr>,
+    /// [`LiteralGuarantee`]s that are used to try and prove a predicate can 
not
+    /// possibly evaluate to `true`.
+    literal_guarantees: Vec<LiteralGuarantee>,

Review Comment:
   Those docs are very helpful 👍 



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -198,40 +232,47 @@ impl PruningPredicate {
     ///
     /// [`ExprSimplifier`]: 
crate::optimizer::simplify_expressions::ExprSimplifier
     pub fn prune<S: PruningStatistics>(&self, statistics: &S) -> 
Result<Vec<bool>> {
+        let mut builder = BoolVecBuilder::new(statistics.num_containers());
+
+        // Try to prove the predicate can't be true for the containers based on
+        // literal guarantees
+        for literal_guarantee in &self.literal_guarantees {
+            let LiteralGuarantee {
+                column,
+                guarantee,
+                literals,
+            } = literal_guarantee;
+            if let Some(results) = statistics.contained(column, literals) {
+                match guarantee {
+                    // `In` means the values in the column must be one of the
+                    // values in the set for the predicate to evaluate to true.
+                    // If `contained` returns false, that means the column is
+                    // not any of the values so we can prune the container
+                    Guarantee::In => builder.append_array(&results),
+                    // `NotIn` means the values in the column must must not be
+                    // any of the values in the set for the predicate to
+                    // evaluate to true. If contained returns true, it means 
the
+                    // column is only in the set of values so we can prune the
+                    // container
+                    Guarantee::NotIn => {
+                        builder.append_array(&arrow::compute::not(&results)?)
+                    }
+                }
+            }
+        }
+
+        // Next, try to prove the predicate can't be true for the containers 
based
+        // on min/max values

Review Comment:
   I realize min/max has the same confidence with bloom filter and guarantee. 
Thinking this way might make it easier to verify `Guarantee`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to