Copilot commented on code in PR #19419:
URL: https://github.com/apache/datafusion/pull/19419#discussion_r2636892055
##########
datafusion/physical-expr/src/projection.rs:
##########
@@ -2593,4 +2625,68 @@ pub(crate) mod tests {
Ok(())
}
+
+ #[test]
+ fn test_project_statistics_with_literal() -> Result<()> {
+ let input_stats = get_stats();
+ let input_schema = get_schema();
+
+ // Projection with literal: SELECT 42 AS constant, col0 AS num
+ let projection = ProjectionExprs::new(vec![
+ ProjectionExpr {
+ expr: Arc::new(Literal::new(ScalarValue::Int64(Some(42)))),
+ alias: "constant".to_string(),
+ },
+ ProjectionExpr {
+ expr: Arc::new(Column::new("col0", 0)),
+ alias: "num".to_string(),
+ },
+ ]);
+
+ let output_stats = projection.project_statistics(
+ input_stats,
+ &projection.project_schema(&input_schema)?,
+ )?;
+
+ // Row count should be preserved
+ assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+ // Should have 2 column statistics
+ assert_eq!(output_stats.column_statistics.len(), 2);
+
+ // First column (literal 42) should have proper constant statistics
+ assert_eq!(
+ output_stats.column_statistics[0].min_value,
+ Precision::Exact(ScalarValue::Int64(Some(42)))
+ );
+ assert_eq!(
+ output_stats.column_statistics[0].max_value,
+ Precision::Exact(ScalarValue::Int64(Some(42)))
+ );
+ assert_eq!(
+ output_stats.column_statistics[0].distinct_count,
+ Precision::Exact(1)
+ );
+ assert_eq!(
+ output_stats.column_statistics[0].null_count,
+ Precision::Exact(0)
+ );
+ // Int64 is 8 bytes, 5 rows = 40 bytes
+ assert_eq!(
+ output_stats.column_statistics[0].byte_size,
+ Precision::Exact(40)
+ );
Review Comment:
The test should validate the sum_value field for the literal column. While
setting sum_value to Precision::Absent is a reasonable choice (as mentioned in
the comment on line 619), the test should explicitly assert this to ensure the
behavior is documented and verified.
##########
datafusion/physical-expr/src/projection.rs:
##########
@@ -587,6 +587,38 @@ impl ProjectionExprs {
let expr = &proj_expr.expr;
let col_stats = if let Some(col) =
expr.as_any().downcast_ref::<Column>() {
std::mem::take(&mut stats.column_statistics[col.index()])
+ } else if let Some(literal) =
expr.as_any().downcast_ref::<Literal>()
+ && !literal.value().is_null()
+ {
+ // For constant columns (non-null literals), output proper
statistics
+ let value = literal.value();
+ let data_type = expr.data_type(output_schema)?;
+
+ // For constant columns:
+ // - min_value = max_value = the literal value
+ // - distinct_count = 1
+ // - null_count = 0
+ // - byte_size = calculated from data type and num_rows
+ let distinct_count = Precision::Exact(1);
+ let null_count = Precision::Exact(0);
+
+ // Calculate byte_size: for primitive types, use width *
num_rows
+ let byte_size = if let Some(byte_width) =
data_type.primitive_width() {
+ stats.num_rows.multiply(&Precision::Exact(byte_width))
+ } else {
+ // For complex types (Utf8, List, etc.), we can't
calculate exact size
+ // without knowing the actual data, so leave it as Absent
Review Comment:
The comment says "we can't calculate exact size without knowing the actual
data", but we do know the actual data (it's in the value variable). However,
the issue is that for complex types like Utf8, the byte_size when materialized
as an array depends on the array encoding (e.g., dictionary encoding, string
view arrays), which makes exact calculation complex. Consider clarifying the
comment to say "For complex types, byte_size depends on array encoding and
representation, so we conservatively set it to Absent" to better explain the
rationale.
```suggestion
// For complex types (Utf8, List, etc.), the byte_size
when materialized
// as an array depends on the array encoding and
representation, so we
// conservatively set it to Absent
```
##########
datafusion/physical-expr/src/projection.rs:
##########
@@ -2593,4 +2625,68 @@ pub(crate) mod tests {
Ok(())
}
+
+ #[test]
+ fn test_project_statistics_with_literal() -> Result<()> {
+ let input_stats = get_stats();
+ let input_schema = get_schema();
+
+ // Projection with literal: SELECT 42 AS constant, col0 AS num
+ let projection = ProjectionExprs::new(vec![
+ ProjectionExpr {
+ expr: Arc::new(Literal::new(ScalarValue::Int64(Some(42)))),
+ alias: "constant".to_string(),
+ },
+ ProjectionExpr {
+ expr: Arc::new(Column::new("col0", 0)),
+ alias: "num".to_string(),
+ },
+ ]);
+
+ let output_stats = projection.project_statistics(
+ input_stats,
+ &projection.project_schema(&input_schema)?,
+ )?;
+
+ // Row count should be preserved
+ assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+ // Should have 2 column statistics
+ assert_eq!(output_stats.column_statistics.len(), 2);
+
+ // First column (literal 42) should have proper constant statistics
+ assert_eq!(
+ output_stats.column_statistics[0].min_value,
+ Precision::Exact(ScalarValue::Int64(Some(42)))
+ );
+ assert_eq!(
+ output_stats.column_statistics[0].max_value,
+ Precision::Exact(ScalarValue::Int64(Some(42)))
+ );
+ assert_eq!(
+ output_stats.column_statistics[0].distinct_count,
+ Precision::Exact(1)
+ );
+ assert_eq!(
+ output_stats.column_statistics[0].null_count,
+ Precision::Exact(0)
+ );
+ // Int64 is 8 bytes, 5 rows = 40 bytes
+ assert_eq!(
+ output_stats.column_statistics[0].byte_size,
+ Precision::Exact(40)
+ );
+
+ // Second column (col0) should preserve statistics
+ assert_eq!(
+ output_stats.column_statistics[1].distinct_count,
+ Precision::Exact(5)
+ );
+ assert_eq!(
+ output_stats.column_statistics[1].max_value,
+ Precision::Exact(ScalarValue::Int64(Some(21)))
+ );
+
+ Ok(())
+ }
Review Comment:
The test only covers non-null literals with primitive types (Int64).
Consider adding test cases for:
1. Null literals - these should return ColumnStatistics::new_unknown() as
per line 591 logic
2. Complex type literals (e.g., Utf8) - these should have byte_size set to
Precision::Absent as per line 611
These edge cases are important to verify the complete behavior of the
literal statistics calculation.
##########
datafusion/physical-expr/src/projection.rs:
##########
@@ -587,6 +587,38 @@ impl ProjectionExprs {
let expr = &proj_expr.expr;
let col_stats = if let Some(col) =
expr.as_any().downcast_ref::<Column>() {
std::mem::take(&mut stats.column_statistics[col.index()])
+ } else if let Some(literal) =
expr.as_any().downcast_ref::<Literal>()
+ && !literal.value().is_null()
+ {
+ // For constant columns (non-null literals), output proper
statistics
+ let value = literal.value();
+ let data_type = expr.data_type(output_schema)?;
+
+ // For constant columns:
+ // - min_value = max_value = the literal value
+ // - distinct_count = 1
+ // - null_count = 0
+ // - byte_size = calculated from data type and num_rows
+ let distinct_count = Precision::Exact(1);
+ let null_count = Precision::Exact(0);
+
+ // Calculate byte_size: for primitive types, use width *
num_rows
+ let byte_size = if let Some(byte_width) =
data_type.primitive_width() {
+ stats.num_rows.multiply(&Precision::Exact(byte_width))
+ } else {
+ // For complex types (Utf8, List, etc.), we can't
calculate exact size
+ // without knowing the actual data, so leave it as Absent
+ Precision::Absent
+ };
+
+ ColumnStatistics {
+ min_value: Precision::Exact(value.clone()),
+ max_value: Precision::Exact(value.clone()),
+ distinct_count,
+ null_count,
+ sum_value: Precision::Absent, // Sum doesn't make sense
for constants
Review Comment:
The comment states "Sum doesn't make sense for constants", but actually the
sum of a constant column would be constant_value * num_rows, which is a
meaningful and potentially useful statistic (similar to how cross joins
multiply sum_value by row count). Consider calculating sum_value as
stats.num_rows.multiply(&Precision::Exact(value.clone())) for numeric types, or
document why it's intentionally left as Absent.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]