This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push: new 14a7adec05 Fix ambiguous column names in substrait conversion as a result of literals having the same name during conversion. (#17299) 14a7adec05 is described below commit 14a7adec0587ac67063c119bfb40551947869c24 Author: Xander <zander...@googlemail.com> AuthorDate: Wed Sep 10 20:26:19 2025 +0100 Fix ambiguous column names in substrait conversion as a result of literals having the same name during conversion. (#17299) * Fix ambigious column names in substrate conversion as a result of literals having the same names * move it to the project * do it only for projects * comment * fmt * comments --------- Co-authored-by: Xander Bailey <xbai...@palantir.com> Co-authored-by: Andrew Lamb <and...@nerdnetworks.org> --- Cargo.lock | 1 + datafusion/substrait/Cargo.toml | 1 + .../src/logical_plan/consumer/rel/project_rel.rs | 15 +- .../substrait/tests/cases/consumer_integration.rs | 40 +-- datafusion/substrait/tests/cases/logical_plans.rs | 41 +++ ...mbiguate_literals_with_same_name.substrait.json | 287 +++++++++++++++++++++ 6 files changed, 368 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 471967d009..a5d2e17791 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2747,6 +2747,7 @@ dependencies = [ "substrait", "tokio", "url", + "uuid", ] [[package]] diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index 63a69b4886..69bf4e8bcd 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -42,6 +42,7 @@ prost = { workspace = true } substrait = { version = "0.58", features = ["serde"] } url = { workspace = true } tokio = { workspace = true, features = ["fs"] } +uuid = { version = "1.17.0", features = ["v4"] } [dev-dependencies] datafusion = { workspace = true, features = ["nested_expressions"] } diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs index 8ece639297..239073108c 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs @@ -62,7 +62,20 @@ pub async fn from_project_rel( // to transform it into a column reference window_exprs.insert(e.clone()); } - explicit_exprs.push(name_tracker.get_uniquely_named_expr(e)?); + // Substrait plans are ordinal based, so they do not provide names for columns. + // Names for columns are generated by Datafusion during conversion, and for literals + // Datafusion produces names based on the literal value. It is possible to construct + // valid Substrait plans that result in duplicated names if the same literal value is + // used in multiple relations. To avoid this issue, we alias literals with unique names. + // The name tracker will ensure that two literals in the same project would have + // unique names but, it does not ensure that if a literal column exists in a previous + // project say before a join that it is deduplicated with respect to those columns. + // See: https://github.com/apache/datafusion/pull/17299 + let maybe_apply_alias = match e { + lit @ Expr::Literal(_, _) => lit.alias(uuid::Uuid::new_v4().to_string()), + _ => e, + }; + explicit_exprs.push(name_tracker.get_uniquely_named_expr(maybe_apply_alias)?); } let input = if !window_exprs.is_empty() { diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs index 2845cc304b..6ea0de9379 100644 --- a/datafusion/substrait/tests/cases/consumer_integration.rs +++ b/datafusion/substrait/tests/cases/consumer_integration.rs @@ -647,23 +647,31 @@ mod tests { #[tokio::test] async fn test_multiple_unions() -> Result<()> { let plan_str = test_plan_to_string("multiple_unions.json").await?; - assert_snapshot!( - plan_str, - @r#" - Projection: Utf8("people") AS product_category, Utf8("people")__temp__0 AS product_type, product_key - Union - Projection: Utf8("people"), Utf8("people") AS Utf8("people")__temp__0, sales.product_key - Left Join: sales.product_key = food.@food_id - TableScan: sales - TableScan: food - Union - Projection: people.$f3, people.$f5, people.product_key0 - Left Join: people.product_key0 = food.@food_id - TableScan: people - TableScan: food - TableScan: more_products - "# + + let mut settings = insta::Settings::clone_current(); + settings.add_filter( + r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", + "[UUID]", + ); + settings.bind(|| { + assert_snapshot!( + plan_str, + @r#" + Projection: [UUID] AS product_category, [UUID] AS product_type, product_key + Union + Projection: Utf8("people") AS [UUID], Utf8("people") AS [UUID], sales.product_key + Left Join: sales.product_key = food.@food_id + TableScan: sales + TableScan: food + Union + Projection: people.$f3, people.$f5, people.product_key0 + Left Join: people.product_key0 = food.@food_id + TableScan: people + TableScan: food + TableScan: more_products + "# ); + }); Ok(()) } diff --git a/datafusion/substrait/tests/cases/logical_plans.rs b/datafusion/substrait/tests/cases/logical_plans.rs index 4dd9719303..426f3c12e5 100644 --- a/datafusion/substrait/tests/cases/logical_plans.rs +++ b/datafusion/substrait/tests/cases/logical_plans.rs @@ -144,6 +144,47 @@ mod tests { Ok(()) } + #[tokio::test] + async fn null_literal_before_and_after_joins() -> Result<()> { + // Confirms that literals used before and after a join but for different columns + // are correctly handled. + + // File generated with substrait-java's Isthmus: + // ./isthmus-cli/build/graal/isthmus --create "create table A (a int); create table B (a int, c int); create table C (a int, d int)" "select t.*, C.d, CAST(NULL AS VARCHAR) as e from (select a, CAST(NULL AS VARCHAR) as c from A UNION ALL select a, c from B) t LEFT JOIN C ON t.a = C.a" + let proto_plan = + read_json("tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json"); + let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?; + let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?; + + let mut settings = insta::Settings::clone_current(); + settings.add_filter( + r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", + "[UUID]", + ); + settings.bind(|| { + assert_snapshot!( + plan, + @r#" + Projection: left.A, left.[UUID] AS C, right.D, Utf8(NULL) AS [UUID] AS E + Left Join: left.A = right.A + SubqueryAlias: left + Union + Projection: A.A, Utf8(NULL) AS [UUID] + TableScan: A + Projection: B.A, CAST(B.C AS Utf8) + TableScan: B + SubqueryAlias: right + TableScan: C + "# + ); + }); + + // Trigger execution to ensure plan validity + DataFrame::new(ctx.state(), plan).show().await?; + + Ok(()) + } + #[tokio::test] async fn non_nullable_lists() -> Result<()> { // DataFusion's Substrait consumer treats all lists as nullable, even if the Substrait plan specifies them as non-nullable. diff --git a/datafusion/substrait/tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json b/datafusion/substrait/tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json new file mode 100644 index 0000000000..d72830898f --- /dev/null +++ b/datafusion/substrait/tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json @@ -0,0 +1,287 @@ +{ + "extensionUris": [{ + "extensionUriAnchor": 1, + "uri": "/functions_comparison.yaml" + }], + "extensions": [{ + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 1, + "name": "equal:any_any" + } + }], + "relations": [{ + "root": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [4, 5, 6, 7] + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "set": { + "common": { + "direct": { + } + }, + "inputs": [{ + "project": { + "common": { + "emit": { + "outputMapping": [1, 2] + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["A"], + "struct": { + "types": [{ + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": ["A"] + } + } + }, + "expressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + "literal": { + "null": { + "string": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "nullable": false, + "typeVariationReference": 0 + } + }] + } + }, { + "project": { + "common": { + "emit": { + "outputMapping": [2, 3] + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["A", "C"], + "struct": { + "types": [{ + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": ["B"] + } + } + }, + "expressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + "cast": { + "type": { + "string": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_THROW_EXCEPTION" + } + }] + } + }], + "op": "SET_OP_UNION_ALL" + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["A", "D"], + "struct": { + "types": [{ + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": ["C"] + } + } + }, + "expression": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }, + "type": "JOIN_TYPE_LEFT" + } + }, + "expressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }, { + "literal": { + "null": { + "string": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "nullable": false, + "typeVariationReference": 0 + } + }] + } + }, + "names": ["A", "C", "D", "E"] + } + }], + "expectedTypeUrls": [], + "version": { + "majorNumber": 0, + "minorNumber": 74, + "patchNumber": 0, + "gitHash": "", + "producer": "isthmus" + }, + "parameterBindings": [] +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org