waynexia commented on code in PR #8440:
URL: https://github.com/apache/arrow-datafusion/pull/8440#discussion_r1435120353
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -254,9 +295,91 @@ impl PruningPredicate {
is_always_true(&self.predicate_expr)
}
- pub(crate) fn required_columns(&self) -> &RequiredStatColumns {
+ pub(crate) fn required_columns(&self) -> &RequiredColumns {
&self.required_columns
}
+
+ /// Names of the columns that are known to be / not be in a set
+ /// of literals (constants). These are the columns the that may be passed
to
+ /// [`PruningStatistics::contained`] during pruning.
+ ///
+ /// This is useful to avoid fetching statistics for columns that will not
be
+ /// used in the predicate. For example, it can be used to avoid reading
+ /// uneeded bloom filters (a non trivial operation).
+ pub fn literal_columns(&self) -> Vec<String> {
+ let mut seen = HashSet::new();
+ self.literal_guarantees
+ .iter()
+ .map(|e| &e.column.name)
+ // avoid duplicates
+ .filter(|name| seen.insert(*name))
+ .map(|s| s.to_string())
+ .collect()
+ }
+}
+
+/// Builds the return `Vec` for [`PruningPredicate::prune`].
+#[derive(Debug)]
+struct BoolVecBuilder {
+ /// One element per container. Each element is
+ /// * `true`: if the container has row that may pass the predicate
+ /// * `false`: if the container has rows that DEFINITELY DO NOT pass the
predicate
+ inner: Vec<bool>,
+}
+
+impl BoolVecBuilder {
+ /// Create a new `BoolVecBuilder` with `num_containers` elements
+ fn new(num_containers: usize) -> Self {
+ Self {
+ // assume by default all containers may pass the predicate
+ inner: vec![true; num_containers],
+ }
+ }
+
+ /// Combines result `array` for a conjunct (e.g. `AND` clause) of a
+ /// predicate into the currently in progress array.
+ ///
+ /// Each `array` element is:
+ /// * `true`: container has row that may pass the predicate
+ /// * `false`: all container rows DEFINITELY DO NOT pass the predicate
+ /// * `null`: container may or may not have rows that pass the predicate
+ fn append_array(&mut self, array: &BooleanArray) {
+ assert_eq!(array.len(), self.inner.len());
+ for (cur, new) in self.inner.iter_mut().zip(array.iter()) {
+ // `false` for this conjunct means we know for sure no rows could
+ // pass the predicate and thus we set the corresponding container
+ // location to false.
+ if let Some(false) = new {
+ *cur = false;
+ }
+ }
+ }
+
+ /// Combines the results in the [`ColumnarValue`] to the currently in
+ /// progress array, following the same rules as [`Self::append_array`].
+ ///
+ /// # Panics
+ /// If `value` is not boolean
+ fn append_value(&mut self, value: ColumnarValue) {
Review Comment:
A random thought about the naming: "append" sometimes implies "push and
extend", but from the implementation, this method looks closer to "and"(&) the
given boolean array with the existing one.
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -142,12 +167,17 @@ pub trait PruningStatistics {
pub struct PruningPredicate {
/// The input schema against which the predicate will be evaluated
schema: SchemaRef,
- /// Actual pruning predicate (rewritten in terms of column min/max
statistics)
+ /// A min/max pruning predicate (rewritten in terms of column min/max
+ /// values, which are supplied by statistics)
predicate_expr: Arc<dyn PhysicalExpr>,
- /// The statistics required to evaluate this predicate
- required_columns: RequiredStatColumns,
- /// Original physical predicate from which this predicate expr is derived
(required for serialization)
+ /// Description of which statistics are required to evaluate
`predicate_expr`
+ required_columns: RequiredColumns,
+ /// Original physical predicate from which this predicate expr is derived
+ /// (required for serialization)
orig_expr: Arc<dyn PhysicalExpr>,
+ /// [`LiteralGuarantee`]s that are used to try and prove a predicate can
not
+ /// possibly evaluate to `true`.
+ literal_guarantees: Vec<LiteralGuarantee>,
Review Comment:
Those docs are very helpful 👍
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -198,40 +232,47 @@ impl PruningPredicate {
///
/// [`ExprSimplifier`]:
crate::optimizer::simplify_expressions::ExprSimplifier
pub fn prune<S: PruningStatistics>(&self, statistics: &S) ->
Result<Vec<bool>> {
+ let mut builder = BoolVecBuilder::new(statistics.num_containers());
+
+ // Try to prove the predicate can't be true for the containers based on
+ // literal guarantees
+ for literal_guarantee in &self.literal_guarantees {
+ let LiteralGuarantee {
+ column,
+ guarantee,
+ literals,
+ } = literal_guarantee;
+ if let Some(results) = statistics.contained(column, literals) {
+ match guarantee {
+ // `In` means the values in the column must be one of the
+ // values in the set for the predicate to evaluate to true.
+ // If `contained` returns false, that means the column is
+ // not any of the values so we can prune the container
+ Guarantee::In => builder.append_array(&results),
+ // `NotIn` means the values in the column must must not be
+ // any of the values in the set for the predicate to
+ // evaluate to true. If contained returns true, it means
the
+ // column is only in the set of values so we can prune the
+ // container
+ Guarantee::NotIn => {
+ builder.append_array(&arrow::compute::not(&results)?)
+ }
+ }
+ }
+ }
+
+ // Next, try to prove the predicate can't be true for the containers
based
+ // on min/max values
Review Comment:
I realize min/max has the same confidence with bloom filter and guarantee.
Thinking this way might make it easier to verify `Guarantee`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]