This is an automated email from the ASF dual-hosted git repository. jeffreyvo pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push: new bb0c3ff8cd Document how schema projection works. (#17250) bb0c3ff8cd is described below commit bb0c3ff8cdff511fc5774b9c08049ad34f238647 Author: wiedld <wie...@users.noreply.github.com> AuthorDate: Fri Sep 12 19:20:45 2025 -0700 Document how schema projection works. (#17250) * chore: add docs for projection's handling of field property resolution * chore: document that Alias metadata precludes alias trimming * chore: document merge_consecutive_projections() * chore: document OptimizeProjections struct * chore: update docs links. --- datafusion/expr/src/expr_schema.rs | 46 ++++++++++++++++++++++ datafusion/expr/src/logical_plan/plan.rs | 8 ++++ datafusion/expr/src/utils.rs | 18 ++++++++- .../optimizer/src/optimize_projections/mod.rs | 33 +++++++++++++++- 4 files changed, 103 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 8ca479bb6f..27f58d7d81 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -371,8 +371,54 @@ impl ExprSchemable for Expr { /// Returns a [arrow::datatypes::Field] compatible with this expression. /// + /// This function converts an expression into a field with appropriate metadata + /// and nullability based on the expression type and context. It is the primary + /// mechanism for determining field-level schemas. 
+ /// + /// # Field Property Resolution + /// + /// For each expression, the following properties are determined: + /// + /// ## Data Type Resolution + /// - **Column references**: Data type from input schema field + /// - **Literals**: Data type inferred from literal value + /// - **Aliases**: Data type inherited from the underlying expression (the aliased expression) + /// - **Binary expressions**: Result type from type coercion rules + /// - **Boolean expressions**: Always a boolean type + /// - **Cast expressions**: Target data type from cast operation + /// - **Function calls**: Return type based on function signature and argument types + /// + /// ## Nullability Determination + /// - **Column references**: Inherit nullability from input schema field + /// - **Literals**: Nullable only if literal value is NULL + /// - **Aliases**: Inherit nullability from the underlying expression (the aliased expression) + /// - **Binary expressions**: Nullable if either operand is nullable + /// - **Boolean expressions**: Always non-nullable (IS NULL, EXISTS, etc.) 
+ /// - **Cast expressions**: determined by the input expression's nullability rules + /// - **Function calls**: Based on function nullability rules and input nullability + /// + /// ## Metadata Handling + /// - **Column references**: Preserve original field metadata from input schema + /// - **Literals**: Use explicitly provided metadata, otherwise empty + /// - **Aliases**: Merge underlying expr metadata with alias-specific metadata, preferring the alias metadata + /// - **Binary expressions**: field metadata is empty + /// - **Boolean expressions**: field metadata is empty + /// - **Cast expressions**: determined by the input expression's field metadata handling + /// - **Scalar functions**: Generate metadata via function's [`return_field_from_args`] method, + /// with the default implementation returning empty field metadata + /// - **Aggregate functions**: Generate metadata via function's [`return_field`] method, + /// with the default implementation returning empty field metadata + /// - **Window functions**: field metadata is empty + /// + /// ## Table Reference Scoping + /// - Establishes proper qualified field references when columns belong to specific tables + /// - Maintains table context for accurate field resolution in multi-table scenarios + /// /// So for example, a projected expression `col(c1) + col(c2)` is /// placed in an output field **named** col("c1 + c2") + /// + /// [`return_field_from_args`]: crate::ScalarUDF::return_field_from_args + /// [`return_field`]: crate::AggregateUDF::return_field fn to_field( &self, schema: &dyn ExprSchema, diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 03d60084ee..fba43780ef 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2195,14 +2195,22 @@ impl Projection { /// will be computed. /// * `exprs`: A slice of `Expr` expressions representing the projection operation to apply. 
/// +/// # Metadata Handling +/// +/// - **Schema-level metadata**: Passed through unchanged from the input schema +/// - **Field-level metadata**: Determined by each expression via [`exprlist_to_fields`], which +/// calls [`Expr::to_field`] to handle expression-specific metadata (literals, aliases, etc.) +/// /// # Returns /// /// A `Result` containing an `Arc<DFSchema>` representing the schema of the result /// produced by the projection operation. If the schema computation is successful, /// the `Result` will contain the schema; otherwise, it will contain an error. pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result<Arc<DFSchema>> { + // Preserve input schema metadata at the schema level let metadata = input.schema().metadata().clone(); + // Convert expressions to fields with Field properties determined by `Expr::to_field` let schema = DFSchema::new_with_metadata(exprlist_to_fields(exprs, input)?, metadata)? .with_functional_dependencies(calc_func_dependencies_for_project( diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 2e364d0d2b..b68b2aae02 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -690,7 +690,23 @@ where err } -/// Create field meta-data from an expression, for use in a result set schema +/// Create schema fields from an expression list, for use in result set schema construction +/// +/// This function converts a list of expressions into a list of complete schema fields, +/// making comprehensive determinations about each field's properties including: +/// - **Data type**: Resolved based on expression type and input schema context +/// - **Nullability**: Determined by expression-specific nullability rules +/// - **Metadata**: Computed based on expression type (preserving, merging, or generating new metadata) +/// - **Table reference scoping**: Establishing proper qualified field references +/// +/// Each expression is converted to a field by calling [`Expr::to_field`], 
which performs +/// the complete field resolution process for all field properties. +/// +/// # Returns +/// +/// A `Result` containing a vector of `(Option<TableReference>, Arc<Field>)` tuples, +/// where each Field contains complete schema information (type, nullability, metadata) +/// and proper table reference scoping for the corresponding expression. pub fn exprlist_to_fields<'a>( exprs: impl IntoIterator<Item = &'a Expr>, plan: &LogicalPlan, diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 1e9c40a5d7..d6e3f6051f 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -55,6 +55,24 @@ use datafusion_common::tree_node::{ /// The rule analyzes the input logical plan, determines the necessary column /// indices, and then removes any unnecessary columns. It also removes any /// unnecessary projections from the plan tree. +/// +/// ## Schema, Field Properties, and Metadata Handling +/// +/// The `OptimizeProjections` rule preserves schema and field metadata in most optimization scenarios: +/// +/// **Schema-level metadata preservation by plan type**: +/// - **Window and Aggregate plans**: Schema metadata is preserved +/// - **Projection plans**: Schema metadata is preserved per [`projection_schema`](datafusion_expr::logical_plan::projection_schema). +/// - **Other logical plans**: Schema metadata is preserved unless [`LogicalPlan::recompute_schema`] +/// is called on plan types that drop metadata +/// +/// **Field-level properties and metadata**: Individual field properties are preserved when fields +/// are retained in the optimized plan, determined by [`exprlist_to_fields`](datafusion_expr::utils::exprlist_to_fields) +/// and [`ExprSchemable::to_field`](datafusion_expr::expr_schema::ExprSchemable::to_field). 
+/// +/// **Field precedence**: When the same field appears multiple times, the optimizer +/// maintains one occurrence and removes duplicates (refer to `RequiredIndices::compact()`), +/// preserving the properties and metadata of that occurrence. #[derive(Default, Debug)] pub struct OptimizeProjections {} @@ -435,6 +453,18 @@ fn optimize_projections( /// appear more than once in its input fields. This can act as a caching mechanism /// for non-trivial computations. /// +/// ## Metadata Handling During Projection Merging +/// +/// **Alias metadata preservation**: When merging projections, alias metadata from both +/// the current and previous projections is carefully preserved. The presence of metadata +/// precludes alias trimming. +/// +/// **Schema, Fields, and metadata**: If a projection is rewritten, the schema and metadata +/// are preserved. Individual field properties and metadata flow through expression rewriting +/// and are preserved when fields are referenced in the merged projection. +/// Refer to [`projection_schema`](datafusion_expr::logical_plan::projection_schema) +/// for more details. +/// /// # Parameters /// /// * `proj` - A reference to the `Projection` to be merged. @@ -558,7 +588,8 @@ fn is_expr_trivial(expr: &Expr) -> bool { /// - `Err(error)`: An error occurred during the function call. /// /// # Notes -/// This rewrite also removes any unnecessary layers of aliasing. +/// This rewrite also removes any unnecessary layers of aliasing. "Unnecessary" is +/// defined as not contributing new information, such as metadata. /// /// Without trimming, we can end up with unnecessary indirections inside expressions /// during projection merges. --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org