alamb commented on code in PR #4519:
URL: https://github.com/apache/arrow-datafusion/pull/4519#discussion_r1042075509
##########
datafusion/core/src/physical_plan/filter.rs:
##########
@@ -502,6 +523,52 @@ mod tests {
Ok(())
}
+ #[tokio::test]
+ async fn test_filter_statistics_column_level_nested() -> Result<()> {
+ // Table:
+ // a: min=1, max=100
+ let schema = Schema::new(vec![Field::new("a", DataType::Int32,
false)]);
+ let input = Arc::new(StatisticsExec::new(
+ Statistics {
+ num_rows: Some(100),
+ column_statistics: Some(vec![ColumnStatistics {
+ min_value: Some(ScalarValue::Int32(Some(1))),
+ max_value: Some(ScalarValue::Int32(Some(100))),
+ ..Default::default()
+ }]),
+ ..Default::default()
+ },
+ schema.clone(),
+ ));
+
+ // WHERE a <= 25
+ let sub_filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(
+ binary(col("a", &schema)?, Operator::LtEq, lit(25i32), &schema)?,
+ input,
+ )?);
+
+ // Nested filters (two separate physical plans, instead of AND chain
in the expr)
+ // WHERE a >= 10
+ // WHERE a <= 25
+ let filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(
+ binary(col("a", &schema)?, Operator::GtEq, lit(10i32), &schema)?,
+ sub_filter,
+ )?);
+
+ let statistics = filter.statistics();
+ assert_eq!(statistics.num_rows, Some(16));
Review Comment:
👍 Nice test / demonstration
##########
datafusion/core/src/physical_plan/filter.rs:
##########
@@ -502,6 +523,52 @@ mod tests {
Ok(())
}
+ #[tokio::test]
+ async fn test_filter_statistics_column_level_nested() -> Result<()> {
+ // Table:
+ // a: min=1, max=100
+ let schema = Schema::new(vec![Field::new("a", DataType::Int32,
false)]);
+ let input = Arc::new(StatisticsExec::new(
+ Statistics {
+ num_rows: Some(100),
+ column_statistics: Some(vec![ColumnStatistics {
+ min_value: Some(ScalarValue::Int32(Some(1))),
+ max_value: Some(ScalarValue::Int32(Some(100))),
+ ..Default::default()
+ }]),
+ ..Default::default()
+ },
+ schema.clone(),
+ ));
+
+ // WHERE a <= 25
+ let sub_filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(
+ binary(col("a", &schema)?, Operator::LtEq, lit(25i32), &schema)?,
+ input,
+ )?);
+
+ // Nested filters (two separate physical plans, instead of AND chain
in the expr)
+ // WHERE a >= 10
+ // WHERE a <= 25
+ let filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(
+ binary(col("a", &schema)?, Operator::GtEq, lit(10i32), &schema)?,
+ sub_filter,
+ )?);
+
+ let statistics = filter.statistics();
+ assert_eq!(statistics.num_rows, Some(16));
+ assert_eq!(
+ statistics.column_statistics,
+ Some(vec![ColumnStatistics {
+ min_value: Some(ScalarValue::Int32(Some(10))),
+ max_value: Some(ScalarValue::Int32(Some(25))),
+ ..Default::default()
+ }])
+ );
+
+ Ok(())
+ }
+
Review Comment:
What would you think about adding a test showing multiple column boundaries,
like
```
Filter (a <= 25)
Filter( b > 5)
Filter(a <= 20)
```
?
##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -3178,13 +3253,16 @@ mod tests {
((Operator::GtEq, 50.7), (1.0 / distance, 50.7, 50.7)),
];
- for ((operator, rhs), (exp_selectivity, _, _)) in cases {
+ for ((operator, rhs), (exp_selectivity, exp_min, exp_max)) in cases {
let context = AnalysisContext::from_statistics(&schema,
&statistics);
let left = col("a", &schema).unwrap();
let right = ScalarValue::from(rhs);
- let boundaries =
- analyze_expr_scalar_comparison(&operator, &context, &left,
right)
- .expect("this case should not return None");
+ let analysis_ctx =
+ analyze_expr_scalar_comparison(context, &operator, &left,
right);
+ let boundaries = analysis_ctx
+ .clone()
+ .boundaries
Review Comment:
It doesn't matter for the test, but you might be able to avoid this clone
with something like
```suggestion
let boundaries = analysis_ctx
.boundaries
.as_ref()
```
##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -704,12 +721,12 @@ impl PartialEq<dyn Any> for BinaryExpr {
// (on the right). The new boundaries will indicate whether it is always true,
always
// false, or unknown (with a probablistic selectivity value attached).
Review Comment:
I wonder if these docstrings could/should be updated based on the changed
signature
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]