alamb commented on code in PR #3454:
URL: https://github.com/apache/arrow-datafusion/pull/3454#discussion_r971171772


##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -168,38 +170,51 @@ impl PruningPredicate {
     /// simplified version `b`. The predicates are simplified via the
     /// ConstantFolding optimizer pass
     pub fn prune<S: PruningStatistics>(&self, statistics: &S) -> Result<Vec<bool>> {
-        // build statistics record batch
-        let predicate_array =
-            build_statistics_record_batch(statistics, &self.required_columns)
-                .and_then(|statistics_batch| {
-                    // execute predicate expression
-                    self.predicate_expr.evaluate(&statistics_batch)
-                })
-                .and_then(|v| match v {
-                    ColumnarValue::Array(array) => Ok(array),
-                    ColumnarValue::Scalar(_) => Err(DataFusionError::Internal(
-                        "predicate expression didn't return an array".to_string(),
-                    )),
-                })?;
-
-        let predicate_array = predicate_array
-            .as_any()
-            .downcast_ref::<BooleanArray>()
-            .ok_or_else(|| {
-                DataFusionError::Internal(format!(
-                    "Expected pruning predicate evaluation to be BooleanArray, \
-                     but was {:?}",
-                    predicate_array
-                ))
-            })?;
-
-        // when the result of the predicate expression for a row group is null / undefined,
-        // e.g. due to missing statistics, this row group can't be filtered out,
-        // so replace with true
-        Ok(predicate_array
-            .into_iter()
-            .map(|x| x.unwrap_or(true))
-            .collect::<Vec<_>>())
+        // build a RecordBatch that contains the min/max values in the
+        // appropriate statistics columns
+        let statistics_batch =
+            build_statistics_record_batch(statistics, &self.required_columns)?;
+
+        // Evaluate the pruning predicate on that record batch.
+        //
+        // Use true when the result of evaluating a predicate
+        // expression on a row group is null (aka `None`). Null can
+        // arise when the statistics are unknown or some calculation
+        // in the predicate means we don't know for sure if the row
+        // group can be filtered out or not. To maintain correctness
+        // the row group must be kept and thus `true` is returned.
+        match self.predicate_expr.evaluate(&statistics_batch)? {
+            ColumnarValue::Array(array) => {
+                let predicate_array = array
+                    .as_any()
+                    .downcast_ref::<BooleanArray>()
+                    .ok_or_else(|| {
+                        DataFusionError::Internal(format!(
+                            "Expected pruning predicate evaluation to be BooleanArray, \
+                             but was {:?}",
+                            array
+                        ))
+                    })?;
+
+                Ok(predicate_array
+                   .into_iter()
+                   .map(|x| x.unwrap_or(true)) // None -> true per comments above
+                   .collect::<Vec<_>>())
+
+            },
+            // result was a column
+            ColumnarValue::Scalar(ScalarValue::Boolean(v)) => {
+                let v = v.unwrap_or(true); // None -> true per comments above

Review Comment:
   Yes -- this code path is covered by tests. I verified this by applying this change locally:
   
   ```diff
   --- a/datafusion/core/src/physical_optimizer/pruning.rs
   +++ b/datafusion/core/src/physical_optimizer/pruning.rs
   @@ -205,8 +205,7 @@ impl PruningPredicate {
                },
                // result was a column
                ColumnarValue::Scalar(ScalarValue::Boolean(v)) => {
   -                let v = v.unwrap_or(true); // None -> true per comments above
   -                Ok(vec![v; statistics.num_containers()])
   +                unimplemented!();
                }
                other => {
                    Err(DataFusionError::Internal(format!(
   ```
   
   And then running this command:
   
   ```shell
   cargo test -p datafusion -- pruning
   ...
   
   failures:
   
   ---- physical_optimizer::pruning::tests::prune_int32_col_lte_zero_cast stdout ----
   thread 'physical_optimizer::pruning::tests::prune_int32_col_lte_zero_cast' panicked at 'not implemented', datafusion/core/src/physical_optimizer/pruning.rs:208:17
   stack backtrace:
      0: rust_begin_unwind
                at /rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/std/src/panicking.rs:584:5
      1: core::panicking::panic_fmt
                at /rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/panicking.rs:142:14
      2: core::panicking::panic
                at /rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/panicking.rs:48:5
      3: datafusion::physical_optimizer::pruning::PruningPredicate::prune
                at ./src/physical_optimizer/pruning.rs:208:17
      4: datafusion::physical_optimizer::pruning::tests::prune_int32_col_lte_zero_cast
                at ./src/physical_optimizer/pruning.rs:1963:22
      5: datafusion::physical_optimizer::pruning::tests::prune_int32_col_lte_zero_cast::{{closure}}
                at ./src/physical_optimizer/pruning.rs:1949:5
      6: core::ops::function::FnOnce::call_once
                at /rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/ops/function.rs:248:5
      7: core::ops::function::FnOnce::call_once
                at /rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/ops/function.rs:248:5
   note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
   
   ---- physical_optimizer::pruning::tests::prune_int32_col_eq_zero_cast_as_str stdout ----
   thread 'physical_optimizer::pruning::tests::prune_int32_col_eq_zero_cast_as_str' panicked at 'not implemented', datafusion/core/src/physical_optimizer/pruning.rs:208:17
   stack backtrace:
      0: rust_begin_unwind
                at /rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/std/src/panicking.rs:584:5
      1: core::panicking::panic_fmt
                at /rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/panicking.rs:142:14
      2: core::panicking::panic
                at /rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/panicking.rs:48:5
      3: datafusion::physical_optimizer::pruning::PruningPredicate::prune
                at ./src/physical_optimizer/pruning.rs:208:17
      4: datafusion::physical_optimizer::pruning::tests::prune_int32_col_eq_zero_cast_as_str
                at ./src/physical_optimizer/pruning.rs:2046:22
      5: datafusion::physical_optimizer::pruning::tests::prune_int32_col_eq_zero_cast_as_str::{{closure}}
                at ./src/physical_optimizer/pruning.rs:2030:5
      6: core::ops::function::FnOnce::call_once
                at /rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/ops/function.rs:248:5
      7: core::ops::function::FnOnce::call_once
                at /rustc/4b91a6ea7258a947e59c6522cd5898e7c0a6a88f/library/core/src/ops/function.rs:248:5
   note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
   
   
   ```
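
   As a side note, the `None -> true` handling is the same in both branches: a container whose predicate result is unknown must be kept. Here is a minimal standalone sketch of that mapping (plain Rust, no DataFusion or Arrow types; the helper names are made up for illustration):

   ```rust
   /// The "array" branch: one predicate result per container. `None` means the
   /// result is unknown (e.g. missing statistics), so the container is kept by
   /// mapping it to `true`.
   fn keep_containers(results: Vec<Option<bool>>) -> Vec<bool> {
       results.into_iter().map(|r| r.unwrap_or(true)).collect()
   }

   /// The "scalar" branch: the predicate folded to a single (possibly unknown)
   /// boolean, so every container gets the same decision.
   fn keep_all_containers(result: Option<bool>, num_containers: usize) -> Vec<bool> {
       vec![result.unwrap_or(true); num_containers]
   }

   fn main() {
       // Array branch: one result per container; the unknown one is kept.
       assert_eq!(
           keep_containers(vec![Some(true), None, Some(false)]),
           vec![true, true, false]
       );
       // Scalar branch: an unknown result keeps every container.
       assert_eq!(keep_all_containers(None, 3), vec![true, true, true]);
   }
   ```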
   


