This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 14a7adec05 Fix ambiguous column names in substrait conversion as a 
result of literals having the same name during conversion. (#17299)
14a7adec05 is described below

commit 14a7adec0587ac67063c119bfb40551947869c24
Author: Xander <zander...@googlemail.com>
AuthorDate: Wed Sep 10 20:26:19 2025 +0100

    Fix ambiguous column names in substrait conversion as a result of literals 
having the same name during conversion. (#17299)
    
    * Fix ambigious column names in substrate conversion as a result of 
literals having the same names
    
    * move it to the project
    
    * do it only for projects
    
    * comment
    
    * fmt
    
    * comments
    
    ---------
    
    Co-authored-by: Xander Bailey <xbai...@palantir.com>
    Co-authored-by: Andrew Lamb <and...@nerdnetworks.org>
---
 Cargo.lock                                         |   1 +
 datafusion/substrait/Cargo.toml                    |   1 +
 .../src/logical_plan/consumer/rel/project_rel.rs   |  15 +-
 .../substrait/tests/cases/consumer_integration.rs  |  40 +--
 datafusion/substrait/tests/cases/logical_plans.rs  |  41 +++
 ...mbiguate_literals_with_same_name.substrait.json | 287 +++++++++++++++++++++
 6 files changed, 368 insertions(+), 17 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 471967d009..a5d2e17791 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2747,6 +2747,7 @@ dependencies = [
  "substrait",
  "tokio",
  "url",
+ "uuid",
 ]
 
 [[package]]
diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml
index 63a69b4886..69bf4e8bcd 100644
--- a/datafusion/substrait/Cargo.toml
+++ b/datafusion/substrait/Cargo.toml
@@ -42,6 +42,7 @@ prost = { workspace = true }
 substrait = { version = "0.58", features = ["serde"] }
 url = { workspace = true }
 tokio = { workspace = true, features = ["fs"] }
+uuid = { version = "1.17.0", features = ["v4"] }
 
 [dev-dependencies]
 datafusion = { workspace = true, features = ["nested_expressions"] }
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs 
b/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs
index 8ece639297..239073108c 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs
@@ -62,7 +62,20 @@ pub async fn from_project_rel(
                 // to transform it into a column reference
                 window_exprs.insert(e.clone());
             }
-            explicit_exprs.push(name_tracker.get_uniquely_named_expr(e)?);
+            // Substrait plans are ordinal based, so they do not provide names 
for columns.
+            // Names for columns are generated by Datafusion during 
conversion, and for literals
+            // Datafusion produces names based on the literal value. It is 
possible to construct
+            // valid Substrait plans that result in duplicated names if the 
same literal value is
+            // used in multiple relations. To avoid this issue, we alias 
literals with unique names.
+            // The name tracker will ensure that two literals in the same 
project would have
+            // unique names but, it does not ensure that if a literal column 
exists in a previous
+            // project say before a join that it is deduplicated with respect 
to those columns.
+            // See: https://github.com/apache/datafusion/pull/17299
+            let maybe_apply_alias = match e {
+                lit @ Expr::Literal(_, _) => 
lit.alias(uuid::Uuid::new_v4().to_string()),
+                _ => e,
+            };
+            
explicit_exprs.push(name_tracker.get_uniquely_named_expr(maybe_apply_alias)?);
         }
 
         let input = if !window_exprs.is_empty() {
diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs 
b/datafusion/substrait/tests/cases/consumer_integration.rs
index 2845cc304b..6ea0de9379 100644
--- a/datafusion/substrait/tests/cases/consumer_integration.rs
+++ b/datafusion/substrait/tests/cases/consumer_integration.rs
@@ -647,23 +647,31 @@ mod tests {
     #[tokio::test]
     async fn test_multiple_unions() -> Result<()> {
         let plan_str = test_plan_to_string("multiple_unions.json").await?;
-        assert_snapshot!(
-        plan_str,
-        @r#"
-        Projection: Utf8("people") AS product_category, 
Utf8("people")__temp__0 AS product_type, product_key
-          Union
-            Projection: Utf8("people"), Utf8("people") AS 
Utf8("people")__temp__0, sales.product_key
-              Left Join: sales.product_key = food.@food_id
-                TableScan: sales
-                TableScan: food
-            Union
-              Projection: people.$f3, people.$f5, people.product_key0
-                Left Join: people.product_key0 = food.@food_id
-                  TableScan: people
-                  TableScan: food
-              TableScan: more_products
-        "#
+
+        let mut settings = insta::Settings::clone_current();
+        settings.add_filter(
+            r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
+            "[UUID]",
+        );
+        settings.bind(|| {
+            assert_snapshot!(
+            plan_str,
+            @r#"
+            Projection: [UUID] AS product_category, [UUID] AS product_type, 
product_key
+              Union
+                Projection: Utf8("people") AS [UUID], Utf8("people") AS 
[UUID], sales.product_key
+                  Left Join: sales.product_key = food.@food_id
+                    TableScan: sales
+                    TableScan: food
+                Union
+                  Projection: people.$f3, people.$f5, people.product_key0
+                    Left Join: people.product_key0 = food.@food_id
+                      TableScan: people
+                      TableScan: food
+                  TableScan: more_products
+            "#
         );
+        });
 
         Ok(())
     }
diff --git a/datafusion/substrait/tests/cases/logical_plans.rs 
b/datafusion/substrait/tests/cases/logical_plans.rs
index 4dd9719303..426f3c12e5 100644
--- a/datafusion/substrait/tests/cases/logical_plans.rs
+++ b/datafusion/substrait/tests/cases/logical_plans.rs
@@ -144,6 +144,47 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn null_literal_before_and_after_joins() -> Result<()> {
+        // Confirms that literals used before and after a join but for 
different columns
+        // are correctly handled.
+
+        // File generated with substrait-java's Isthmus:
+        // ./isthmus-cli/build/graal/isthmus --create "create table A (a int); 
create table B (a int, c int); create table C (a int, d int)" "select t.*, C.d, 
CAST(NULL AS VARCHAR) as e from (select a, CAST(NULL AS VARCHAR) as c from A 
UNION ALL select a, c from B) t LEFT JOIN C ON t.a = C.a"
+        let proto_plan =
+            
read_json("tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json");
+        let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?;
+        let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?;
+
+        let mut settings = insta::Settings::clone_current();
+        settings.add_filter(
+            r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
+            "[UUID]",
+        );
+        settings.bind(|| {
+            assert_snapshot!(
+                plan,
+                @r#"
+            Projection: left.A, left.[UUID] AS C, right.D, Utf8(NULL) AS 
[UUID] AS E
+              Left Join: left.A = right.A
+                SubqueryAlias: left
+                  Union
+                    Projection: A.A, Utf8(NULL) AS [UUID]
+                      TableScan: A
+                    Projection: B.A, CAST(B.C AS Utf8)
+                      TableScan: B
+                SubqueryAlias: right
+                  TableScan: C
+            "#
+            );
+        });
+
+        // Trigger execution to ensure plan validity
+        DataFrame::new(ctx.state(), plan).show().await?;
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn non_nullable_lists() -> Result<()> {
         // DataFusion's Substrait consumer treats all lists as nullable, even 
if the Substrait plan specifies them as non-nullable.
diff --git 
a/datafusion/substrait/tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json
 
b/datafusion/substrait/tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json
new file mode 100644
index 0000000000..d72830898f
--- /dev/null
+++ 
b/datafusion/substrait/tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json
@@ -0,0 +1,287 @@
+{
+  "extensionUris": [{
+    "extensionUriAnchor": 1,
+    "uri": "/functions_comparison.yaml"
+  }],
+  "extensions": [{
+    "extensionFunction": {
+      "extensionUriReference": 1,
+      "functionAnchor": 1,
+      "name": "equal:any_any"
+    }
+  }],
+  "relations": [{
+    "root": {
+      "input": {
+        "project": {
+          "common": {
+            "emit": {
+              "outputMapping": [4, 5, 6, 7]
+            }
+          },
+          "input": {
+            "join": {
+              "common": {
+                "direct": {
+                }
+              },
+              "left": {
+                "set": {
+                  "common": {
+                    "direct": {
+                    }
+                  },
+                  "inputs": [{
+                    "project": {
+                      "common": {
+                        "emit": {
+                          "outputMapping": [1, 2]
+                        }
+                      },
+                      "input": {
+                        "read": {
+                          "common": {
+                            "direct": {
+                            }
+                          },
+                          "baseSchema": {
+                            "names": ["A"],
+                            "struct": {
+                              "types": [{
+                                "i32": {
+                                  "typeVariationReference": 0,
+                                  "nullability": "NULLABILITY_NULLABLE"
+                                }
+                              }],
+                              "typeVariationReference": 0,
+                              "nullability": "NULLABILITY_REQUIRED"
+                            }
+                          },
+                          "namedTable": {
+                            "names": ["A"]
+                          }
+                        }
+                      },
+                      "expressions": [{
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 0
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }, {
+                        "literal": {
+                          "null": {
+                            "string": {
+                              "typeVariationReference": 0,
+                              "nullability": "NULLABILITY_NULLABLE"
+                            }
+                          },
+                          "nullable": false,
+                          "typeVariationReference": 0
+                        }
+                      }]
+                    }
+                  }, {
+                    "project": {
+                      "common": {
+                        "emit": {
+                          "outputMapping": [2, 3]
+                        }
+                      },
+                      "input": {
+                        "read": {
+                          "common": {
+                            "direct": {
+                            }
+                          },
+                          "baseSchema": {
+                            "names": ["A", "C"],
+                            "struct": {
+                              "types": [{
+                                "i32": {
+                                  "typeVariationReference": 0,
+                                  "nullability": "NULLABILITY_NULLABLE"
+                                }
+                              }, {
+                                "i32": {
+                                  "typeVariationReference": 0,
+                                  "nullability": "NULLABILITY_NULLABLE"
+                                }
+                              }],
+                              "typeVariationReference": 0,
+                              "nullability": "NULLABILITY_REQUIRED"
+                            }
+                          },
+                          "namedTable": {
+                            "names": ["B"]
+                          }
+                        }
+                      },
+                      "expressions": [{
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 0
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }, {
+                        "cast": {
+                          "type": {
+                            "string": {
+                              "typeVariationReference": 0,
+                              "nullability": "NULLABILITY_NULLABLE"
+                            }
+                          },
+                          "input": {
+                            "selection": {
+                              "directReference": {
+                                "structField": {
+                                  "field": 1
+                                }
+                              },
+                              "rootReference": {
+                              }
+                            }
+                          },
+                          "failureBehavior": "FAILURE_BEHAVIOR_THROW_EXCEPTION"
+                        }
+                      }]
+                    }
+                  }],
+                  "op": "SET_OP_UNION_ALL"
+                }
+              },
+              "right": {
+                "read": {
+                  "common": {
+                    "direct": {
+                    }
+                  },
+                  "baseSchema": {
+                    "names": ["A", "D"],
+                    "struct": {
+                      "types": [{
+                        "i32": {
+                          "typeVariationReference": 0,
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      }, {
+                        "i32": {
+                          "typeVariationReference": 0,
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      }],
+                      "typeVariationReference": 0,
+                      "nullability": "NULLABILITY_REQUIRED"
+                    }
+                  },
+                  "namedTable": {
+                    "names": ["C"]
+                  }
+                }
+              },
+              "expression": {
+                "scalarFunction": {
+                  "functionReference": 1,
+                  "args": [],
+                  "outputType": {
+                    "bool": {
+                      "typeVariationReference": 0,
+                      "nullability": "NULLABILITY_NULLABLE"
+                    }
+                  },
+                  "arguments": [{
+                    "value": {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 0
+                          }
+                        },
+                        "rootReference": {
+                        }
+                      }
+                    }
+                  }, {
+                    "value": {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 2
+                          }
+                        },
+                        "rootReference": {
+                        }
+                      }
+                    }
+                  }],
+                  "options": []
+                }
+              },
+              "type": "JOIN_TYPE_LEFT"
+            }
+          },
+          "expressions": [{
+            "selection": {
+              "directReference": {
+                "structField": {
+                  "field": 0
+                }
+              },
+              "rootReference": {
+              }
+            }
+          }, {
+            "selection": {
+              "directReference": {
+                "structField": {
+                  "field": 1
+                }
+              },
+              "rootReference": {
+              }
+            }
+          }, {
+            "selection": {
+              "directReference": {
+                "structField": {
+                  "field": 3
+                }
+              },
+              "rootReference": {
+              }
+            }
+          }, {
+            "literal": {
+              "null": {
+                "string": {
+                  "typeVariationReference": 0,
+                  "nullability": "NULLABILITY_NULLABLE"
+                }
+              },
+              "nullable": false,
+              "typeVariationReference": 0
+            }
+          }]
+        }
+      },
+      "names": ["A", "C", "D", "E"]
+    }
+  }],
+  "expectedTypeUrls": [],
+  "version": {
+    "majorNumber": 0,
+    "minorNumber": 74,
+    "patchNumber": 0,
+    "gitHash": "",
+    "producer": "isthmus"
+  },
+  "parameterBindings": []
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org

Reply via email to