Re: [PR] Estimate aggregate output rows using existing NDV statistics [datafusion]

via GitHub Thu, 19 Mar 2026 09:17:46 -0700


asolimando commented on code in PR #20926:
URL: https://github.com/apache/datafusion/pull/20926#discussion_r2959650262



##########
datafusion/physical-plan/src/aggregates/mod.rs:
##########
@@ -4333,6 +4362,157 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_aggregate_stats_ndv_zero_column() -> Result<()> {
+        use crate::test::exec::StatisticsExec;
+        use datafusion_common::ColumnStatistics;
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ]));
+
+        let input_stats = Statistics {
+            num_rows: Precision::Exact(1_000),
+            total_byte_size: Precision::Inexact(1_000),
+            column_statistics: vec![
+                ColumnStatistics {
+                    distinct_count: Precision::Exact(0),
+                    null_count: Precision::Exact(1_000),
+                    ..ColumnStatistics::new_unknown()
+                },
+                ColumnStatistics {
+                    distinct_count: Precision::Exact(50),
+                    ..ColumnStatistics::new_unknown()
+                },
+            ],
+        };
+
+        let input = Arc::new(StatisticsExec::new(input_stats, 
(*schema).clone()))
+            as Arc<dyn ExecutionPlan>;
+
+        let agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            PhysicalGroupBy::new_single(vec![
+                (col("a", &schema)? as Arc<dyn PhysicalExpr>, "a".to_string()),
+                (col("b", &schema)? as Arc<dyn PhysicalExpr>, "b".to_string()),
+            ]),
+            vec![Arc::new(
+                AggregateExprBuilder::new(count_udaf(), vec![col("a", 
&schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("COUNT(a)")
+                    .build()?,
+            )],
+            vec![None],
+            input,
+            Arc::clone(&schema),
+        )?;
+
+        let stats = agg.partition_statistics(None)?;
+        // NDV(a)=0 with nulls => max(0+1, 1)=1, NDV(b)=50 => 1*50=50
+        assert_eq!(
+            stats.num_rows,
+            Precision::Inexact(50),
+            "all-null column should contribute 1 to the product, not 0"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_aggregate_stats_absent_num_rows_with_ndv() -> Result<()> {
+        use crate::test::exec::StatisticsExec;
+        use datafusion_common::ColumnStatistics;
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", 
DataType::Int32, true)]));
+
+        let input_stats = Statistics {
+            num_rows: Precision::Absent,
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![ColumnStatistics {
+                distinct_count: Precision::Exact(100),
+                ..ColumnStatistics::new_unknown()
+            }],
+        };
+
+        let input = Arc::new(StatisticsExec::new(input_stats, 
(*schema).clone()))
+            as Arc<dyn ExecutionPlan>;
+
+        let agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            PhysicalGroupBy::new_single(vec![(
+                col("a", &schema)? as Arc<dyn PhysicalExpr>,
+                "a".to_string(),
+            )]),
+            vec![Arc::new(
+                AggregateExprBuilder::new(count_udaf(), vec![col("a", 
&schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("COUNT(a)")
+                    .build()?,
+            )],
+            vec![None],
+            input,
+            Arc::clone(&schema),
+        )?;
+
+        let stats = agg.partition_statistics(None)?;
+        assert_eq!(
+            stats.num_rows,
+            Precision::Inexact(100),
+            "absent num_rows should fall back to NDV estimate"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_aggregate_stats_absent_num_rows_with_ndv_and_limit() -> Result<()> 
{
+        use crate::test::exec::StatisticsExec;
+        use datafusion_common::ColumnStatistics;
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", 
DataType::Int32, true)]));
+
+        let input_stats = Statistics {
+            num_rows: Precision::Absent,
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![ColumnStatistics {
+                distinct_count: Precision::Exact(100),
+                ..ColumnStatistics::new_unknown()
+            }],
+        };
+
+        let input = Arc::new(StatisticsExec::new(input_stats, 
(*schema).clone()))
+            as Arc<dyn ExecutionPlan>;
+
+        let mut agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            PhysicalGroupBy::new_single(vec![(
+                col("a", &schema)? as Arc<dyn PhysicalExpr>,
+                "a".to_string(),
+            )]),
+            vec![Arc::new(
+                AggregateExprBuilder::new(count_udaf(), vec![col("a", 
&schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("COUNT(a)")
+                    .build()?,
+            )],
+            vec![None],
+            input,
+            Arc::clone(&schema),
+        )?;

Review Comment:
   Nit: the standalone tests all repeat the same 
schema/StatisticsExec/AggregateExec construction. A small helper (e.g., 
`build_test_aggregate(schema, stats, group_by, limit) -> AggregateExec`) would 
reduce the boilerplate.
   
   It's not so much about the repeated code; it's more that the tests are harder 
to read, which is a pity as they are pretty meaningful.
   
   Additionally, the three tests that only check `num_rows` (`ndv_zero_column`, 
`absent_num_rows_with_ndv`, `absent_num_rows_with_ndv_and_limit`) could be 
folded into the existing table-driven `test_aggregate_cardinality_estimation`, 
which is very readable.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Estimate aggregate output rows using existing NDV statistics [datafusion]

Reply via email to