alamb commented on code in PR #8440:
URL: https://github.com/apache/arrow-datafusion/pull/8440#discussion_r1417686897
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -276,21 +383,21 @@ fn is_always_true(expr: &Arc<dyn PhysicalExpr>) -> bool {
/// Handles creating references to the min/max statistics
/// for columns as well as recording which statistics are needed
#[derive(Debug, Default, Clone)]
-pub(crate) struct RequiredStatColumns {
+pub(crate) struct RequiredColumns {
Review Comment:
I renamed this to be more specific, and since it is `crate` private it is not
a breaking API change.
One thing I did try was encoding the columns needed for literal guarantees
in this structure, but I found the code was very specific to `min/max/count`
statistics.
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -93,6 +94,27 @@ pub trait PruningStatistics {
///
/// Note: the returned array must contain [`Self::num_containers`] rows
fn null_counts(&self, column: &Column) -> Option<ArrayRef>;
+
+ /// Returns an array where each row represents if the value of the
+ /// column is known to contain or not contain any of the set of `values`.
+ ///
+ /// This is used to prune containers using structures such as Bloom
+ /// Filters which can quickly test set membership.
+ ///
+ /// The returned array has one row for each container, with the following:
+ /// * `true` if the value of column CERTAINLY IS one of `values`
+ /// * `false` if the value of column CERTAINLY IS NOT one of `values`
+ /// * `null` if the value of column may or may not be in values
+ ///
+ /// If these statistics can not determine column membership for any
+ /// container, return `None` (the default).
+ ///
+ /// Note: the returned array must contain [`Self::num_containers`] rows
+ fn contains(
Review Comment:
This is the new API -- it is slightly different from the proposal because it
takes a `HashSet` rather than a single value, which is necessary to support `x
IN (....)` type predicates.
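
A minimal sketch of how an implementation of this API might look, assuming a
source that knows the exact set of values in each container (the
`KnownValueStats` struct, its field, and the simplified signature without the
`Column` argument are hypothetical here; the real method signature is
truncated in the hunk above):

    use std::collections::HashSet;

    use arrow::array::BooleanArray;
    use datafusion_common::ScalarValue;

    /// Hypothetical statistics source: for each container the exact set of
    /// values a column can take is known (a Bloom filter gives the same
    /// shape of answer, minus false positives).
    struct KnownValueStats {
        /// one entry per container; `None` means "no information"
        per_container_values: Vec<Option<HashSet<ScalarValue>>>,
    }

    impl KnownValueStats {
        /// One row per container:
        /// * `Some(true)`  if every value in the container is in `values`
        /// * `Some(false)` if no value in the container is in `values`
        /// * `None`        if the container may or may not contain `values`
        fn contains(&self, values: &HashSet<ScalarValue>) -> Option<BooleanArray> {
            Some(
                self.per_container_values
                    .iter()
                    .map(|known| {
                        let known = known.as_ref()?;
                        if known.is_disjoint(values) {
                            Some(false) // no row can match: prunable
                        } else if known.is_subset(values) {
                            Some(true) // every row matches one of `values`
                        } else {
                            None // some rows may match, some may not
                        }
                    })
                    .collect(),
            )
        }
    }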
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -344,11 +451,10 @@ impl RequiredStatColumns {
// only add statistics column if not previously added
if need_to_insert {
- let stat_field = Field::new(
- stat_column.name(),
- field.data_type().clone(),
- field.is_nullable(),
- );
+ // may be null if statistics are not present
Review Comment:
A non-nullable column may appear as NULL in the statistics if the min or max
values are not known.
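
The replacement line is truncated in the hunk above; a minimal sketch of the
idea (with `stat_field_for` as a hypothetical helper name): the statistics
field is created as nullable regardless of whether the underlying data column
is nullable.

    use arrow::datatypes::{DataType, Field};

    /// Hypothetical helper mirroring the change: the statistics column is
    /// always declared nullable, because even a non-nullable data column can
    /// have an unknown (NULL) min or max for a given container.
    fn stat_field_for(field: &Field, stat_name: &str) -> Field {
        Field::new(stat_name, field.data_type().clone(), true)
    }

    #[test]
    fn stat_fields_are_nullable() {
        let data_col = Field::new("c1", DataType::Int32, false); // non-nullable data
        assert!(stat_field_for(&data_col, "c1_min").is_nullable());
    }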
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -993,95 +1099,139 @@ mod tests {
///
/// Note All `ArrayRefs` must be the same size.
struct ContainerStats {
- min: ArrayRef,
- max: ArrayRef,
+ min: Option<ArrayRef>,
+ max: Option<ArrayRef>,
/// Optional values
null_counts: Option<ArrayRef>,
+ /// Optional known values (e.g. mimic a bloom filter)
+ /// (value, contains)
+ /// If present, all BooleanArrays must be the same size as min/max
+ contains: Vec<(HashSet<ScalarValue>, BooleanArray)>,
}
impl ContainerStats {
+ fn new() -> Self {
+ Default::default()
+ }
fn new_decimal128(
min: impl IntoIterator<Item = Option<i128>>,
max: impl IntoIterator<Item = Option<i128>>,
precision: u8,
scale: i8,
) -> Self {
- Self {
- min: Arc::new(
+ Self::new()
+ .with_min(Arc::new(
min.into_iter()
.collect::<Decimal128Array>()
.with_precision_and_scale(precision, scale)
.unwrap(),
- ),
- max: Arc::new(
+ ))
+ .with_max(Arc::new(
max.into_iter()
.collect::<Decimal128Array>()
.with_precision_and_scale(precision, scale)
.unwrap(),
- ),
- null_counts: None,
- }
+ ))
}
fn new_i64(
min: impl IntoIterator<Item = Option<i64>>,
max: impl IntoIterator<Item = Option<i64>>,
) -> Self {
- Self {
- min: Arc::new(min.into_iter().collect::<Int64Array>()),
- max: Arc::new(max.into_iter().collect::<Int64Array>()),
- null_counts: None,
- }
+ Self::new()
+ .with_min(Arc::new(min.into_iter().collect::<Int64Array>()))
+ .with_max(Arc::new(max.into_iter().collect::<Int64Array>()))
}
fn new_i32(
min: impl IntoIterator<Item = Option<i32>>,
max: impl IntoIterator<Item = Option<i32>>,
) -> Self {
- Self {
- min: Arc::new(min.into_iter().collect::<Int32Array>()),
- max: Arc::new(max.into_iter().collect::<Int32Array>()),
- null_counts: None,
- }
+ Self::new()
+ .with_min(Arc::new(min.into_iter().collect::<Int32Array>()))
+ .with_max(Arc::new(max.into_iter().collect::<Int32Array>()))
}
fn new_utf8<'a>(
min: impl IntoIterator<Item = Option<&'a str>>,
max: impl IntoIterator<Item = Option<&'a str>>,
) -> Self {
- Self {
- min: Arc::new(min.into_iter().collect::<StringArray>()),
- max: Arc::new(max.into_iter().collect::<StringArray>()),
- null_counts: None,
- }
+ Self::new()
+ .with_min(Arc::new(min.into_iter().collect::<StringArray>()))
+ .with_max(Arc::new(max.into_iter().collect::<StringArray>()))
}
fn new_bool(
min: impl IntoIterator<Item = Option<bool>>,
max: impl IntoIterator<Item = Option<bool>>,
) -> Self {
- Self {
- min: Arc::new(min.into_iter().collect::<BooleanArray>()),
- max: Arc::new(max.into_iter().collect::<BooleanArray>()),
- null_counts: None,
- }
+ Self::new()
+ .with_min(Arc::new(min.into_iter().collect::<BooleanArray>()))
+ .with_max(Arc::new(max.into_iter().collect::<BooleanArray>()))
}
fn min(&self) -> Option<ArrayRef> {
- Some(self.min.clone())
+ self.min.clone()
}
fn max(&self) -> Option<ArrayRef> {
- Some(self.max.clone())
+ self.max.clone()
}
fn null_counts(&self) -> Option<ArrayRef> {
self.null_counts.clone()
}
+ /// return an iterator over all arrays in this statistics
+ fn arrays(&self) -> Vec<ArrayRef> {
Review Comment:
this is testing infrastructure
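
A sketch of what such a helper might do (its body is not shown in the hunk,
and the free-function form below is only illustrative): gather every array the
statistics hold so one consistency check can confirm they all have the same
number of rows, i.e. one row per container.

    use std::collections::HashSet;
    use std::sync::Arc;

    use arrow::array::{ArrayRef, BooleanArray};
    use datafusion_common::ScalarValue;

    /// Collect the optional min/max/null_counts arrays plus every `contains`
    /// answer array into one Vec so the caller can assert they are all the
    /// same length (one row per container).
    fn all_arrays(
        min: &Option<ArrayRef>,
        max: &Option<ArrayRef>,
        null_counts: &Option<ArrayRef>,
        contains: &[(HashSet<ScalarValue>, BooleanArray)],
    ) -> Vec<ArrayRef> {
        let mut out: Vec<ArrayRef> = Vec::new();
        out.extend(min.iter().cloned());
        out.extend(max.iter().cloned());
        out.extend(null_counts.iter().cloned());
        out.extend(
            contains
                .iter()
                .map(|(_values, array)| Arc::new(array.clone()) as ArrayRef),
        );
        out
    }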
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1385,20 +1596,14 @@ mod tests {
// test column on the left
let expr = col("c1").eq(lit(1));
- let predicate_expr = test_build_predicate_expression(
- &expr,
- &schema,
- &mut RequiredStatColumns::new(),
- );
+ let predicate_expr =
Review Comment:
this is just reformatting that results from the shorter name
(`RequiredColumns` replacing `RequiredStatColumns`)
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -2484,10 +2614,376 @@ mod tests {
// TODO: add other negative test for other case and op
}
+ #[test]
+ fn prune_with_contains_one_column() {
Review Comment:
I spent quite a while on these tests and I think they are pretty thorough
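
The test bodies are elided from the hunk; as a rough illustration of the
property they exercise (not the PR's actual test code), only containers whose
`contains` answer is a definite `false` can be pruned, because a `null` answer
means the value might still be present:

    /// Map per-container `contains` answers to "keep this container?" decisions.
    fn keep_containers(contains: &[Option<bool>]) -> Vec<bool> {
        contains
            .iter()
            .map(|c| match c {
                Some(true) => true,   // value certainly present: keep
                Some(false) => false, // value certainly absent: prune
                None => true,         // unknown: must keep to stay correct
            })
            .collect()
    }

    #[test]
    fn contains_pruning_semantics() {
        assert_eq!(
            keep_containers(&[Some(true), Some(false), None]),
            vec![true, false, true]
        );
    }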
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]