isidentical commented on code in PR #4162:
URL: https://github.com/apache/arrow-datafusion/pull/4162#discussion_r1020611956
##########
datafusion/core/src/physical_plan/filter.rs:
##########
@@ -380,4 +403,108 @@ mod tests {
Ok(())
}
+
+ #[tokio::test]
+ async fn test_filter_statistics_basic_expr() -> Result<()> {
+ // Table:
+ // a: min=1, max=100
+ let schema = Schema::new(vec![Field::new("a", DataType::Int32,
false)]);
+ let input = Arc::new(StatisticsExec::new(
+ Statistics {
+ num_rows: Some(100),
+ column_statistics: Some(vec![ColumnStatistics {
+ min_value: Some(ScalarValue::Int32(Some(1))),
+ max_value: Some(ScalarValue::Int32(Some(100))),
+ ..Default::default()
+ }]),
+ ..Default::default()
+ },
+ schema.clone(),
+ ));
+
+ // a <= 25
+ let predicate: Arc<dyn PhysicalExpr> =
+ binary(col("a", &schema)?, Operator::LtEq, lit(25i32), &schema)?;
+
+ // WHERE a <= 25
+ let filter: Arc<dyn ExecutionPlan> =
+ Arc::new(FilterExec::try_new(predicate, input)?);
+
+ let statistics = filter.statistics();
+ assert_eq!(statistics.num_rows, Some(25));
+
+ Ok(())
+ }
+
+ #[tokio::test]
+ #[ignore]
+ // This test requires propagation of column boundaries from the comparison
analysis
+ // to the analysis context. This is not yet implemented.
+ async fn test_filter_statistics_column_level_basic_expr() -> Result<()> {
Review Comment:
@alamb while working on this, I've noticed the initial application of
propagation of new column limits. Since we don't have an API to represent
changes to the boundaries during an expression's analysis (like `a` becoming
`[1, 25]` in the example below), we can't generate the `column_statistics`, which
essentially renders nested join optimizations unusable (and potentially
any other analysis that needs column-level stats).
This doesn't mean it is completely ineffective as is, since we can at least
find the cardinality of the filter itself and do the local filter <-> table switch
in the case below. But I think it might make sense to at least investigate
potential ways to deal with this.

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]