This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new dcc018ee8f Disallow duplicated qualified field names (#12608)
dcc018ee8f is described below
commit dcc018ee8fe8ccc768368ac1441d63983dfe3193
Author: Emil Ejbyfeldt <[email protected]>
AuthorDate: Thu Oct 3 00:00:36 2024 +0200
Disallow duplicated qualified field names (#12608)
* Disallow duplicated qualified field names
* Fix tests
---
datafusion-cli/Cargo.lock | 1 +
datafusion/common/src/dfschema.rs | 12 +++++-
datafusion/core/src/dataframe/mod.rs | 46 ----------------------
datafusion/expr/Cargo.toml | 1 +
datafusion/expr/src/logical_plan/plan.rs | 3 ++
datafusion/expr/src/utils.rs | 18 ++++-----
.../optimizer/tests/optimizer_integration.rs | 4 +-
datafusion/sql/src/planner.rs | 6 +--
datafusion/sqllogictest/test_files/join.slt | 6 +--
9 files changed, 30 insertions(+), 67 deletions(-)
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 8bf62a53cc..2d7ff2af89 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1345,6 +1345,7 @@ dependencies = [
"datafusion-functions-aggregate-common",
"datafusion-functions-window-common",
"datafusion-physical-expr-common",
+ "indexmap",
"paste",
"serde_json",
"sqlparser",
diff --git a/datafusion/common/src/dfschema.rs
b/datafusion/common/src/dfschema.rs
index 0dec14e917..69cdf866cf 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -226,7 +226,12 @@ impl DFSchema {
for (field, qualifier) in
self.inner.fields().iter().zip(&self.field_qualifiers) {
if let Some(qualifier) = qualifier {
- qualified_names.insert((qualifier, field.name()));
+ if !qualified_names.insert((qualifier, field.name())) {
+ return _schema_err!(SchemaError::DuplicateQualifiedField {
+ qualifier: Box::new(qualifier.clone()),
+ name: field.name().to_string(),
+ });
+ }
} else if !unqualified_names.insert(field.name()) {
return _schema_err!(SchemaError::DuplicateUnqualifiedField {
name: field.name().to_string()
@@ -1165,7 +1170,10 @@ mod tests {
let left = DFSchema::try_from_qualified_schema("t1",
&test_schema_1())?;
let right = DFSchema::try_from_qualified_schema("t1",
&test_schema_1())?;
let join = left.join(&right);
- assert!(join.err().is_none());
+ assert_eq!(
+ join.unwrap_err().strip_backtrace(),
+ "Schema error: Schema contains duplicate qualified field name
t1.c0",
+ );
Ok(())
}
diff --git a/datafusion/core/src/dataframe/mod.rs
b/datafusion/core/src/dataframe/mod.rs
index 70c5075114..f5867881da 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -3380,52 +3380,6 @@ mod tests {
Ok(())
}
- // Table 't1' self join
- // Supplementary test of issue:
https://github.com/apache/datafusion/issues/7790
- #[tokio::test]
- async fn with_column_self_join() -> Result<()> {
- let df = test_table().await?.select_columns(&["c1"])?;
- let ctx = SessionContext::new();
-
- ctx.register_table("t1", df.into_view())?;
-
- let df = ctx
- .table("t1")
- .await?
- .join(
- ctx.table("t1").await?,
- JoinType::Inner,
- &["c1"],
- &["c1"],
- None,
- )?
- .sort(vec![
- // make the test deterministic
- col("t1.c1").sort(true, true),
- ])?
- .limit(0, Some(1))?;
-
- let df_results = df.clone().collect().await?;
- assert_batches_sorted_eq!(
- [
- "+----+----+",
- "| c1 | c1 |",
- "+----+----+",
- "| a | a |",
- "+----+----+",
- ],
- &df_results
- );
-
- let actual_err = df.clone().with_column("new_column",
lit(true)).unwrap_err();
- let expected_err = "Error during planning: Projections require unique
expression names \
- but the expression \"t1.c1\" at position 0 and \"t1.c1\" at
position 1 have the same name. \
- Consider aliasing (\"AS\") one of them.";
- assert_eq!(actual_err.strip_backtrace(), expected_err);
-
- Ok(())
- }
-
#[tokio::test]
async fn with_column_renamed() -> Result<()> {
let df = test_table()
diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml
index 55387fea22..d7dc1afe4d 100644
--- a/datafusion/expr/Cargo.toml
+++ b/datafusion/expr/Cargo.toml
@@ -48,6 +48,7 @@ datafusion-expr-common = { workspace = true }
datafusion-functions-aggregate-common = { workspace = true }
datafusion-functions-window-common = { workspace = true }
datafusion-physical-expr-common = { workspace = true }
+indexmap = { workspace = true }
paste = "^1.0"
serde_json = { workspace = true }
sqlparser = { workspace = true }
diff --git a/datafusion/expr/src/logical_plan/plan.rs
b/datafusion/expr/src/logical_plan/plan.rs
index 443d23804a..19e73140b7 100644
--- a/datafusion/expr/src/logical_plan/plan.rs
+++ b/datafusion/expr/src/logical_plan/plan.rs
@@ -51,6 +51,7 @@ use datafusion_common::{
DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence,
FunctionalDependencies, ParamValues, Result, TableReference, UnnestOptions,
};
+use indexmap::IndexSet;
// backwards compatibility
use crate::display::PgJsonVisitor;
@@ -3071,6 +3072,8 @@ fn calc_func_dependencies_for_aggregate(
let group_by_expr_names = group_expr
.iter()
.map(|item| item.schema_name().to_string())
+ .collect::<IndexSet<_>>()
+ .into_iter()
.collect::<Vec<_>>();
let aggregate_func_dependencies = aggregate_functional_dependencies(
input.schema(),
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 1d8eb9445e..9bb53a1d04 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -38,6 +38,7 @@ use datafusion_common::{
DataFusionError, Result, TableReference,
};
+use indexmap::IndexSet;
use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem};
pub use
datafusion_functions_aggregate_common::order::AggregateOrderSensitivity;
@@ -59,16 +60,7 @@ pub fn exprlist_to_columns(expr: &[Expr], accum: &mut
HashSet<Column>) -> Result
/// Count the number of distinct exprs in a list of group by expressions. If
the
/// first element is a `GroupingSet` expression then it must be the only expr.
pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result<usize> {
- if let Some(Expr::GroupingSet(grouping_set)) = group_expr.first() {
- if group_expr.len() > 1 {
- return plan_err!(
- "Invalid group by expressions, GroupingSet must be the only
expression"
- );
- }
- Ok(grouping_set.distinct_expr().len())
- } else {
- Ok(group_expr.len())
- }
+ grouping_set_to_exprlist(group_expr).map(|exprs| exprs.len())
}
/// The [power set] (or powerset) of a set S is the set of all subsets of S, \
@@ -260,7 +252,11 @@ pub fn grouping_set_to_exprlist(group_expr: &[Expr]) ->
Result<Vec<&Expr>> {
}
Ok(grouping_set.distinct_expr())
} else {
- Ok(group_expr.iter().collect())
+ Ok(group_expr
+ .iter()
+ .collect::<IndexSet<_>>()
+ .into_iter()
+ .collect())
}
}
diff --git a/datafusion/optimizer/tests/optimizer_integration.rs
b/datafusion/optimizer/tests/optimizer_integration.rs
index 470bd947c7..2361679857 100644
--- a/datafusion/optimizer/tests/optimizer_integration.rs
+++ b/datafusion/optimizer/tests/optimizer_integration.rs
@@ -345,7 +345,7 @@ fn select_wildcard_with_repeated_column() {
let sql = "SELECT *, col_int32 FROM test";
let err = test_sql(sql).expect_err("query should have failed");
assert_eq!(
- "expand_wildcard_rule\ncaused by\nError during planning: Projections
require unique expression names but the expression \"test.col_int32\" at
position 0 and \"test.col_int32\" at position 7 have the same name. Consider
aliasing (\"AS\") one of them.",
+ "Schema error: Schema contains duplicate qualified field name
test.col_int32",
err.strip_backtrace()
);
}
@@ -396,7 +396,7 @@ fn test_sql(sql: &str) -> Result<LogicalPlan> {
.with_udaf(count_udaf())
.with_udaf(avg_udaf());
let sql_to_rel = SqlToRel::new(&context_provider);
- let plan = sql_to_rel.sql_statement_to_plan(statement.clone()).unwrap();
+ let plan = sql_to_rel.sql_statement_to_plan(statement.clone())?;
let config = OptimizerContext::new().with_skip_failing_rules(false);
let analyzer = Analyzer::new();
diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs
index 5cbe1d7c01..e8defedddf 100644
--- a/datafusion/sql/src/planner.rs
+++ b/datafusion/sql/src/planner.rs
@@ -197,9 +197,9 @@ impl PlannerContext {
/// extends the FROM schema, returning the existing one, if any
pub fn extend_outer_from_schema(&mut self, schema: &DFSchemaRef) ->
Result<()> {
- self.outer_from_schema = match self.outer_from_schema.as_ref() {
- Some(from_schema) => Some(Arc::new(from_schema.join(schema)?)),
- None => Some(Arc::clone(schema)),
+ match self.outer_from_schema.as_mut() {
+ Some(from_schema) => Arc::make_mut(from_schema).merge(schema),
+ None => self.outer_from_schema = Some(Arc::clone(schema)),
};
Ok(())
}
diff --git a/datafusion/sqllogictest/test_files/join.slt
b/datafusion/sqllogictest/test_files/join.slt
index 8d801b92c3..519fbb887c 100644
--- a/datafusion/sqllogictest/test_files/join.slt
+++ b/datafusion/sqllogictest/test_files/join.slt
@@ -1215,14 +1215,14 @@ statement ok
create table t1(v1 int) as values(100);
## Query with Ambiguous column reference
-query error DataFusion error: Schema error: Ambiguous reference to unqualified
field v1
+query error DataFusion error: Schema error: Schema contains duplicate
qualified field name t1\.v1
select count(*)
from t1
right outer join t1
on t1.v1 > 0;
-query error DataFusion error: Schema error: Ambiguous reference to unqualified
field v1
+query error DataFusion error: Schema error: Schema contains duplicate
qualified field name t1\.v1
select t1.v1 from t1 join t1 using(v1) cross join (select struct('foo' as v1)
as t1);
statement ok
-drop table t1;
\ No newline at end of file
+drop table t1;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]