This is an automated email from the ASF dual-hosted git repository.

jeffreyvo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new bb0c3ff8cd Document how schema projection works. (#17250)
bb0c3ff8cd is described below

commit bb0c3ff8cdff511fc5774b9c08049ad34f238647
Author: wiedld <wie...@users.noreply.github.com>
AuthorDate: Fri Sep 12 19:20:45 2025 -0700

    Document how schema projection works. (#17250)
    
    * chore: add docs for projection's handling of field property resolution
    
    * chore: document that Alias metadata precludes alias trimming
    
    * chore: document merge_consecutive_projections()
    
    * chore: document OptimizeProjections struct
    
    * chore: update docs links.
---
 datafusion/expr/src/expr_schema.rs                 | 46 ++++++++++++++++++++++
 datafusion/expr/src/logical_plan/plan.rs           |  8 ++++
 datafusion/expr/src/utils.rs                       | 18 ++++++++-
 .../optimizer/src/optimize_projections/mod.rs      | 33 +++++++++++++++-
 4 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/datafusion/expr/src/expr_schema.rs 
b/datafusion/expr/src/expr_schema.rs
index 8ca479bb6f..27f58d7d81 100644
--- a/datafusion/expr/src/expr_schema.rs
+++ b/datafusion/expr/src/expr_schema.rs
@@ -371,8 +371,54 @@ impl ExprSchemable for Expr {
 
     /// Returns a [arrow::datatypes::Field] compatible with this expression.
     ///
+    /// This function converts an expression into a field with appropriate 
metadata
+    /// and nullability based on the expression type and context. It is the 
primary
+    /// mechanism for determining field-level schemas.
+    ///
+    /// # Field Property Resolution
+    ///
+    /// For each expression, the following properties are determined:
+    ///
+    /// ## Data Type Resolution
+    /// - **Column references**: Data type from input schema field
+    /// - **Literals**: Data type inferred from literal value
+    /// - **Aliases**: Data type inherited from the underlying expression (the 
aliased expression)
+    /// - **Binary expressions**: Result type from type coercion rules
+    /// - **Boolean expressions**: Always a boolean type
+    /// - **Cast expressions**: Target data type from cast operation
+    /// - **Function calls**: Return type based on function signature and 
argument types
+    ///
+    /// ## Nullability Determination
+    /// - **Column references**: Inherit nullability from input schema field
+    /// - **Literals**: Nullable only if literal value is NULL
+    /// - **Aliases**: Inherit nullability from the underlying expression (the 
aliased expression)
+    /// - **Binary expressions**: Nullable if either operand is nullable
+    /// - **Boolean expressions**: Always non-nullable (IS NULL, EXISTS, etc.)
+    /// - **Cast expressions**: Determined by the input expression's 
nullability rules
+    /// - **Function calls**: Based on function nullability rules and input 
nullability
+    ///
+    /// ## Metadata Handling
+    /// - **Column references**: Preserve original field metadata from input 
schema
+    /// - **Literals**: Use explicitly provided metadata, otherwise empty
+    /// - **Aliases**: Merge underlying expr metadata with alias-specific 
metadata, preferring the alias metadata
+    /// - **Binary expressions**: field metadata is empty
+    /// - **Boolean expressions**: field metadata is empty
+    /// - **Cast expressions**: determined by the input expression's field 
metadata handling
+    /// - **Scalar functions**: Generate metadata via function's 
[`return_field_from_args`] method,
+    ///   with the default implementation returning empty field metadata
+    /// - **Aggregate functions**: Generate metadata via function's 
[`return_field`] method,
+    ///   with the default implementation returning empty field metadata
+    /// - **Window functions**: field metadata is empty
+    ///
+    /// ## Table Reference Scoping
+    /// - Establishes proper qualified field references when columns belong to 
specific tables
+    /// - Maintains table context for accurate field resolution in multi-table 
scenarios
+    ///
     /// So for example, a projected expression `col(c1) + col(c2)` is
     /// placed in an output field **named** col("c1 + c2")
+    ///
+    /// [`return_field_from_args`]: crate::ScalarUDF::return_field_from_args
+    /// [`return_field`]: crate::AggregateUDF::return_field
     fn to_field(
         &self,
         schema: &dyn ExprSchema,
diff --git a/datafusion/expr/src/logical_plan/plan.rs 
b/datafusion/expr/src/logical_plan/plan.rs
index 03d60084ee..fba43780ef 100644
--- a/datafusion/expr/src/logical_plan/plan.rs
+++ b/datafusion/expr/src/logical_plan/plan.rs
@@ -2195,14 +2195,22 @@ impl Projection {
 ///   will be computed.
 /// * `exprs`: A slice of `Expr` expressions representing the projection 
operation to apply.
 ///
+/// # Metadata Handling
+///
+/// - **Schema-level metadata**: Passed through unchanged from the input schema
+/// - **Field-level metadata**: Determined by each expression via 
[`exprlist_to_fields`], which
+///   calls [`Expr::to_field`] to handle expression-specific metadata 
(literals, aliases, etc.)
+///
 /// # Returns
 ///
 /// A `Result` containing an `Arc<DFSchema>` representing the schema of the 
result
 /// produced by the projection operation. If the schema computation is 
successful,
 /// the `Result` will contain the schema; otherwise, it will contain an error.
 pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> 
Result<Arc<DFSchema>> {
+    // Preserve input schema metadata at the schema level
     let metadata = input.schema().metadata().clone();
 
+    // Convert expressions to fields with Field properties determined by 
`Expr::to_field`
     let schema =
         DFSchema::new_with_metadata(exprlist_to_fields(exprs, input)?, 
metadata)?
             .with_functional_dependencies(calc_func_dependencies_for_project(
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 2e364d0d2b..b68b2aae02 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -690,7 +690,23 @@ where
     err
 }
 
-/// Create field meta-data from an expression, for use in a result set schema
+/// Create schema fields from an expression list, for use in result set schema 
construction
+///
+/// This function converts a list of expressions into a list of complete 
schema fields,
+/// making comprehensive determinations about each field's properties 
including:
+/// - **Data type**: Resolved based on expression type and input schema context
+/// - **Nullability**: Determined by expression-specific nullability rules
+/// - **Metadata**: Computed based on expression type (preserving, merging, or 
generating new metadata)
+/// - **Table reference scoping**: Establishing proper qualified field 
references
+///
+/// Each expression is converted to a field by calling [`Expr::to_field`], 
which performs
+/// the complete field resolution process for all field properties.
+///
+/// # Returns
+///
+/// A `Result` containing a vector of `(Option<TableReference>, Arc<Field>)` 
tuples,
+/// where each Field contains complete schema information (type, nullability, 
metadata)
+/// and proper table reference scoping for the corresponding expression.
 pub fn exprlist_to_fields<'a>(
     exprs: impl IntoIterator<Item = &'a Expr>,
     plan: &LogicalPlan,
diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs 
b/datafusion/optimizer/src/optimize_projections/mod.rs
index 1e9c40a5d7..d6e3f6051f 100644
--- a/datafusion/optimizer/src/optimize_projections/mod.rs
+++ b/datafusion/optimizer/src/optimize_projections/mod.rs
@@ -55,6 +55,24 @@ use datafusion_common::tree_node::{
 /// The rule analyzes the input logical plan, determines the necessary column
 /// indices, and then removes any unnecessary columns. It also removes any
 /// unnecessary projections from the plan tree.
+///
+/// ## Schema, Field Properties, and Metadata Handling
+///
+/// The `OptimizeProjections` rule preserves schema and field metadata in most 
optimization scenarios:
+///
+/// **Schema-level metadata preservation by plan type**:
+/// - **Window and Aggregate plans**: Schema metadata is preserved
+/// - **Projection plans**: Schema metadata is preserved per 
[`projection_schema`](datafusion_expr::logical_plan::projection_schema).
+/// - **Other logical plans**: Schema metadata is preserved unless 
[`LogicalPlan::recompute_schema`]
+///   is called on plan types that drop metadata
+///
+/// **Field-level properties and metadata**: Individual field properties are 
preserved when fields
+/// are retained in the optimized plan, as determined by 
[`exprlist_to_fields`](datafusion_expr::utils::exprlist_to_fields)
+/// and 
[`ExprSchemable::to_field`](datafusion_expr::expr_schema::ExprSchemable::to_field).
+///
+/// **Field precedence**: When the same field appears multiple times, the 
optimizer
+/// maintains one occurrence and removes duplicates (refer to 
`RequiredIndices::compact()`),
+/// preserving the properties and metadata of that occurrence.
 #[derive(Default, Debug)]
 pub struct OptimizeProjections {}
 
@@ -435,6 +453,18 @@ fn optimize_projections(
 /// appear more than once in its input fields. This can act as a caching 
mechanism
 /// for non-trivial computations.
 ///
+/// ## Metadata Handling During Projection Merging
+///
+/// **Alias metadata preservation**: When merging projections, alias metadata 
from both
+/// the current and previous projections is carefully preserved. The presence 
of metadata
+/// precludes alias trimming.
+///
+/// **Schema, Fields, and metadata**: If a projection is rewritten, the schema 
and metadata
+/// are preserved. Individual field properties and metadata flow through 
expression rewriting
+/// and are preserved when fields are referenced in the merged projection.
+/// Refer to 
[`projection_schema`](datafusion_expr::logical_plan::projection_schema)
+/// for more details.
+///
 /// # Parameters
 ///
 /// * `proj` - A reference to the `Projection` to be merged.
@@ -558,7 +588,8 @@ fn is_expr_trivial(expr: &Expr) -> bool {
 /// - `Err(error)`: An error occurred during the function call.
 ///
 /// # Notes
-/// This rewrite also removes any unnecessary layers of aliasing.
+/// This rewrite also removes any unnecessary layers of aliasing. 
"Unnecessary" is
+/// defined as not contributing new information, such as metadata.
 ///
 /// Without trimming, we can end up with unnecessary indirections inside 
expressions
 /// during projection merges.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org

Reply via email to