Brijesh-Thakkar commented on code in PR #22640:
URL: https://github.com/apache/datafusion/pull/22640#discussion_r3378303333
##########
datafusion/expr/src/logical_plan/plan.rs:
##########
@@ -705,20 +705,76 @@ impl LogicalPlan {
}))
}
LogicalPlan::Union(Union { inputs, schema }) => {
- let first_input_schema = inputs[0].schema();
- if schema.fields().len() == first_input_schema.fields().len() {
- // If inputs are not pruned do not change schema
- Ok(LogicalPlan::Union(Union { inputs, schema }))
- } else {
- // A note on `Union`s constructed via `try_new_by_name`:
- //
- // At this point, the schema for each input should have
- // the same width. Thus, we do not need to save whether a
- // `Union` was created `BY NAME`, and can safely rely on the
- // `try_new` initializer to derive the new schema based on
- // column positions.
- Ok(LogicalPlan::Union(Union::try_new(inputs)?))
+ // Fast path: if all inputs structurally match the cached schema
+ // (field count, types, names, qualifiers, nullability) then no
+ // recomputation is needed and we avoid any allocation.
+ let schemas_match = inputs.iter().all(|input| {
+ let input_schema = input.schema();
+ schema.fields().len() == input_schema.fields().len()
+ && schema.iter().zip(input_schema.iter()).all(
+ |((q1, f1), (q2, f2))| {
+ q1 == q2
+ && f1.name() == f2.name()
+ && f1.data_type() == f2.data_type()
+ && f1.is_nullable() == f2.is_nullable()
+ },
+ )
+ });
+ if schemas_match {
+ // Inputs are structurally identical to the cached schema.
+ return Ok(LogicalPlan::Union(Union { inputs, schema }));
+ }
+
+ // Slow path: inputs changed — recompute the schema.
+ //
+ // NOTE: A note on `Union`s constructed via `try_new_by_name`:
+ // At this point, the schema for each input should have
+ // the same width. Thus, we do not need to save whether a
+ // `Union` was created `BY NAME`, and can safely rely on the
+ // `try_new` initializer to derive the new schema based on
+ // column positions.
+ let mut recomputed = Union::try_new(inputs)?;
+
+ // Metadata preservation: Union::try_new uses intersection logic
+ // for metadata, but we want "later takes precedence" (extend semantics)
+ // to match coerce_union_schema_with_schema in type_coercion.rs.
+ let mut merged_metadata =
+ recomputed.inputs[0].schema().metadata().clone();
+ for input in recomputed.inputs.iter().skip(1) {
+ merged_metadata.extend(input.schema().metadata().clone());
+ }
+
+ let mut merged_field_metadata = recomputed.inputs[0]
+ .schema()
+ .fields()
+ .iter()
+ .map(|f| f.metadata().clone())
Review Comment:
I moved the metadata-merging logic into a new `Union::try_new_with_metadata()`
method on the `Union` struct itself. The slow path in `recompute_schema()` now
delegates to it with a single call. The method can also be reused by
`coerce_union_schema_with_schema` in a follow-up if desired.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]