Copilot commented on code in PR #19419:
URL: https://github.com/apache/datafusion/pull/19419#discussion_r2636892055


##########
datafusion/physical-expr/src/projection.rs:
##########
@@ -2593,4 +2625,68 @@ pub(crate) mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_project_statistics_with_literal() -> Result<()> {
+        let input_stats = get_stats();
+        let input_schema = get_schema();
+
+        // Projection with literal: SELECT 42 AS constant, col0 AS num
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Literal::new(ScalarValue::Int64(Some(42)))),
+                alias: "constant".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col0", 0)),
+                alias: "num".to_string(),
+            },
+        ]);
+
+        let output_stats = projection.project_statistics(
+            input_stats,
+            &projection.project_schema(&input_schema)?,
+        )?;
+
+        // Row count should be preserved
+        assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+        // Should have 2 column statistics
+        assert_eq!(output_stats.column_statistics.len(), 2);
+
+        // First column (literal 42) should have proper constant statistics
+        assert_eq!(
+            output_stats.column_statistics[0].min_value,
+            Precision::Exact(ScalarValue::Int64(Some(42)))
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].max_value,
+            Precision::Exact(ScalarValue::Int64(Some(42)))
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].distinct_count,
+            Precision::Exact(1)
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].null_count,
+            Precision::Exact(0)
+        );
+        // Int64 is 8 bytes, 5 rows = 40 bytes
+        assert_eq!(
+            output_stats.column_statistics[0].byte_size,
+            Precision::Exact(40)
+        );

Review Comment:
   The test should validate the sum_value field for the literal column. While 
setting sum_value to Precision::Absent is a reasonable choice (as mentioned in 
the comment on line 619), the test should explicitly assert this to ensure the 
behavior is documented and verified.



##########
datafusion/physical-expr/src/projection.rs:
##########
@@ -587,6 +587,38 @@ impl ProjectionExprs {
             let expr = &proj_expr.expr;
             let col_stats = if let Some(col) = 
expr.as_any().downcast_ref::<Column>() {
                 std::mem::take(&mut stats.column_statistics[col.index()])
+            } else if let Some(literal) = 
expr.as_any().downcast_ref::<Literal>()
+                && !literal.value().is_null()
+            {
+                // For constant columns (non-null literals), output proper 
statistics
+                let value = literal.value();
+                let data_type = expr.data_type(output_schema)?;
+
+                // For constant columns:
+                // - min_value = max_value = the literal value
+                // - distinct_count = 1
+                // - null_count = 0
+                // - byte_size = calculated from data type and num_rows
+                let distinct_count = Precision::Exact(1);
+                let null_count = Precision::Exact(0);
+
+                // Calculate byte_size: for primitive types, use width * 
num_rows
+                let byte_size = if let Some(byte_width) = 
data_type.primitive_width() {
+                    stats.num_rows.multiply(&Precision::Exact(byte_width))
+                } else {
+                    // For complex types (Utf8, List, etc.), we can't 
calculate exact size
+                    // without knowing the actual data, so leave it as Absent

Review Comment:
   The comment says "we can't calculate exact size without knowing the actual 
data", but we do know the actual data (it's in the value variable). However, 
the issue is that for complex types like Utf8, the byte_size when materialized 
as an array depends on the array encoding (e.g., dictionary encoding, string 
view arrays), which makes exact calculation complex. Consider clarifying the 
comment to say "For complex types, byte_size depends on array encoding and 
representation, so we conservatively set it to Absent" to better explain the 
rationale.
   ```suggestion
                       // For complex types (Utf8, List, etc.), the byte_size 
when materialized
                       // as an array depends on the array encoding and 
representation, so we
                       // conservatively set it to Absent
   ```



##########
datafusion/physical-expr/src/projection.rs:
##########
@@ -2593,4 +2625,68 @@ pub(crate) mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_project_statistics_with_literal() -> Result<()> {
+        let input_stats = get_stats();
+        let input_schema = get_schema();
+
+        // Projection with literal: SELECT 42 AS constant, col0 AS num
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Literal::new(ScalarValue::Int64(Some(42)))),
+                alias: "constant".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col0", 0)),
+                alias: "num".to_string(),
+            },
+        ]);
+
+        let output_stats = projection.project_statistics(
+            input_stats,
+            &projection.project_schema(&input_schema)?,
+        )?;
+
+        // Row count should be preserved
+        assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+        // Should have 2 column statistics
+        assert_eq!(output_stats.column_statistics.len(), 2);
+
+        // First column (literal 42) should have proper constant statistics
+        assert_eq!(
+            output_stats.column_statistics[0].min_value,
+            Precision::Exact(ScalarValue::Int64(Some(42)))
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].max_value,
+            Precision::Exact(ScalarValue::Int64(Some(42)))
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].distinct_count,
+            Precision::Exact(1)
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].null_count,
+            Precision::Exact(0)
+        );
+        // Int64 is 8 bytes, 5 rows = 40 bytes
+        assert_eq!(
+            output_stats.column_statistics[0].byte_size,
+            Precision::Exact(40)
+        );
+
+        // Second column (col0) should preserve statistics
+        assert_eq!(
+            output_stats.column_statistics[1].distinct_count,
+            Precision::Exact(5)
+        );
+        assert_eq!(
+            output_stats.column_statistics[1].max_value,
+            Precision::Exact(ScalarValue::Int64(Some(21)))
+        );
+
+        Ok(())
+    }

Review Comment:
   The test only covers non-null literals with primitive types (Int64). 
Consider adding test cases for:
   1. Null literals - these should return ColumnStatistics::new_unknown() as 
per line 591 logic
   2. Complex type literals (e.g., Utf8) - these should have byte_size set to 
Precision::Absent as per line 611
   
   These edge cases are important to verify the complete behavior of the 
literal statistics calculation.



##########
datafusion/physical-expr/src/projection.rs:
##########
@@ -587,6 +587,38 @@ impl ProjectionExprs {
             let expr = &proj_expr.expr;
             let col_stats = if let Some(col) = 
expr.as_any().downcast_ref::<Column>() {
                 std::mem::take(&mut stats.column_statistics[col.index()])
+            } else if let Some(literal) = 
expr.as_any().downcast_ref::<Literal>()
+                && !literal.value().is_null()
+            {
+                // For constant columns (non-null literals), output proper 
statistics
+                let value = literal.value();
+                let data_type = expr.data_type(output_schema)?;
+
+                // For constant columns:
+                // - min_value = max_value = the literal value
+                // - distinct_count = 1
+                // - null_count = 0
+                // - byte_size = calculated from data type and num_rows
+                let distinct_count = Precision::Exact(1);
+                let null_count = Precision::Exact(0);
+
+                // Calculate byte_size: for primitive types, use width * 
num_rows
+                let byte_size = if let Some(byte_width) = 
data_type.primitive_width() {
+                    stats.num_rows.multiply(&Precision::Exact(byte_width))
+                } else {
+                    // For complex types (Utf8, List, etc.), we can't 
calculate exact size
+                    // without knowing the actual data, so leave it as Absent
+                    Precision::Absent
+                };
+
+                ColumnStatistics {
+                    min_value: Precision::Exact(value.clone()),
+                    max_value: Precision::Exact(value.clone()),
+                    distinct_count,
+                    null_count,
+                    sum_value: Precision::Absent, // Sum doesn't make sense 
for constants

Review Comment:
   The comment states "Sum doesn't make sense for constants", but actually the 
sum of a constant column would be constant_value * num_rows, which is a 
meaningful and potentially useful statistic (similar to how cross joins 
multiply sum_value by row count). Consider calculating sum_value as 
stats.num_rows.multiply(&Precision::Exact(value.clone())) for numeric types, or 
document why it's intentionally left as Absent.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to