alamb commented on code in PR #3454:
URL: https://github.com/apache/arrow-datafusion/pull/3454#discussion_r971171772
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -168,38 +170,51 @@ impl PruningPredicate {
/// simplified version `b`. The predicates are simplified via the
/// ConstantFolding optimizer pass
pub fn prune<S: PruningStatistics>(&self, statistics: &S) ->
Result<Vec<bool>> {
- // build statistics record batch
- let predicate_array =
- build_statistics_record_batch(statistics, &self.required_columns)
- .and_then(|statistics_batch| {
- // execute predicate expression
- self.predicate_expr.evaluate(&statistics_batch)
- })
- .and_then(|v| match v {
- ColumnarValue::Array(array) => Ok(array),
- ColumnarValue::Scalar(_) => Err(DataFusionError::Internal(
- "predicate expression didn't return an
array".to_string(),
- )),
- })?;
-
- let predicate_array = predicate_array
- .as_any()
- .downcast_ref::<BooleanArray>()
- .ok_or_else(|| {
- DataFusionError::Internal(format!(
- "Expected pruning predicate evaluation to be BooleanArray,
\
- but was {:?}",
- predicate_array
- ))
- })?;
-
- // when the result of the predicate expression for a row group is null
/ undefined,
- // e.g. due to missing statistics, this row group can't be filtered
out,
- // so replace with true
- Ok(predicate_array
- .into_iter()
- .map(|x| x.unwrap_or(true))
- .collect::<Vec<_>>())
+ // build a RecordBatch that contains the min/max values in the
+ // appropriate statistics columns
+ let statistics_batch =
+ build_statistics_record_batch(statistics, &self.required_columns)?;
+
+ // Evaluate the pruning predicate on that record batch.
+ //
+ // Use true when the result of evaluating a predicate
+ // expression on a row group is null (aka `None`). Null can
+ // arise when the statistics are unknown or some calculation
+ // in the predicate means we don't know for sure if the row
+ // group can be filtered out or not. To maintain correctness
+ // the row group must be kept and thus `true` is returned.
+ match self.predicate_expr.evaluate(&statistics_batch)? {
+ ColumnarValue::Array(array) => {
+ let predicate_array = array
+ .as_any()
+ .downcast_ref::<BooleanArray>()
+ .ok_or_else(|| {
+ DataFusionError::Internal(format!(
+ "Expected pruning predicate evaluation to be
BooleanArray, \
+ but was {:?}",
+ array
+ ))
+ })?;
+
+ Ok(predicate_array
+ .into_iter()
+ .map(|x| x.unwrap_or(true)) // None -> true per comments
above
+ .collect::<Vec<_>>())
+
+ },
+ // result was a column
+ ColumnarValue::Scalar(ScalarValue::Boolean(v)) => {
+ let v = v.unwrap_or(true); // None -> true per comments above
Review Comment:
Yes -- this code is covered. I verified this by applying this change locally:
```diff
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -205,8 +205,7 @@ impl PruningPredicate {
},
// result was a column
ColumnarValue::Scalar(ScalarValue::Boolean(v)) => {
- let v = v.unwrap_or(true); // None -> true per comments
above
- Ok(vec![v; statistics.num_containers()])
+ unimplemented!();
}
other => {
Err(DataFusionError::Internal(format!(
```
And then running this command
```shell
cargo test -p datafusion -- pruning
...
failures:
---- physical_optimizer::pruning::tests::prune_int32_col_lte_zero_cast
stdout ----
thread 'physical_optimizer::pruning::tests::prune_int32_col_lte_zero_cast'
panicked at 'not implemented',
datafusion/core/src/physical_optimizer/pruning.rs:208:17
stack backtrace:
0: rust_begin_unwind
at
/rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at
/rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/panicking.rs:142:14
2: core::panicking::panic
at
/rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/panicking.rs:48:5
3: datafusion::physical_optimizer::pruning::PruningPredicate::prune
at ./src/physical_optimizer/pruning.rs:208:17
4:
datafusion::physical_optimizer::pruning::tests::prune_int32_col_lte_zero_cast
at ./src/physical_optimizer/pruning.rs:1963:22
5:
datafusion::physical_optimizer::pruning::tests::prune_int32_col_lte_zero_cast::{{closure}}
at ./src/physical_optimizer/pruning.rs:1949:5
6: core::ops::function::FnOnce::call_once
at
/rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/ops/function.rs:248:5
7: core::ops::function::FnOnce::call_once
at
/rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/ops/function.rs:248:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose
backtrace.
---- physical_optimizer::pruning::tests::prune_int32_col_eq_zero_cast_as_str
stdout ----
thread
'physical_optimizer::pruning::tests::prune_int32_col_eq_zero_cast_as_str'
panicked at 'not implemented',
datafusion/core/src/physical_optimizer/pruning.rs:208:17
stack backtrace:
0: rust_begin_unwind
at
/rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/std/src/panicking.rs:584:5
1: core::panicking::panic_fmt
at
/rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/panicking.rs:142:14
2: core::panicking::panic
at
/rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/panicking.rs:48:5
3: datafusion::physical_optimizer::pruning::PruningPredicate::prune
at ./src/physical_optimizer/pruning.rs:208:17
4:
datafusion::physical_optimizer::pruning::tests::prune_int32_col_eq_zero_cast_as_str
at ./src/physical_optimizer/pruning.rs:2046:22
5:
datafusion::physical_optimizer::pruning::tests::prune_int32_col_eq_zero_cast_as_str::{{closure}}
at ./src/physical_optimizer/pruning.rs:2030:5
6: core::ops::function::FnOnce::call_once
at
/rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/ops/function.rs:248:5
7: core::ops::function::FnOnce::call_once
at
/rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/ops/function.rs:248:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose
backtrace.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]