This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new a1c60a1ba9 feat: Quote column names if required in error messages
(#5778)
a1c60a1ba9 is described below
commit a1c60a1ba98e089d7551637f2a78663e66772d88
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Apr 5 21:45:27 2023 +0200
feat: Quote column names if required in error messages (#5778)
* feat: Quote column names if required in error messages
* Update sqllogictest
* Update test
* Update datafusion/common/src/table_reference.rs
Co-authored-by: Jeffrey <[email protected]>
---------
Co-authored-by: Jeffrey <[email protected]>
---
datafusion/common/src/column.rs | 14 ++--
datafusion/common/src/dfschema.rs | 97 ++++++++++++----------
datafusion/common/src/table_reference.rs | 4 +-
datafusion/common/src/utils.rs | 66 ++++++++++++++-
datafusion/core/src/dataframe.rs | 2 +-
datafusion/core/tests/dataframe.rs | 8 +-
datafusion/core/tests/sql/idenfifers.rs | 8 +-
datafusion/core/tests/sql/references.rs | 2 +-
.../core/tests/sqllogictests/test_files/join.slt | 2 +-
datafusion/expr/src/expr_rewriter/mod.rs | 2 +-
datafusion/sql/tests/integration_test.rs | 18 ++--
11 files changed, 149 insertions(+), 74 deletions(-)
diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs
index 78c1857238..d138bb06cd 100644
--- a/datafusion/common/src/column.rs
+++ b/datafusion/common/src/column.rs
@@ -112,13 +112,15 @@ impl Column {
/// Serialize column into a quoted flat name string
pub fn quoted_flat_name(&self) -> String {
- // TODO: quote identifiers only when special characters present
- // see: https://github.com/apache/arrow-datafusion/issues/5523
match &self.relation {
Some(r) => {
- format!("{}.{}", r.to_quoted_string(),
quote_identifier(&self.name))
+ format!(
+ "{}.{}",
+ r.to_quoted_string(),
+ quote_identifier(self.name.as_str())
+ )
}
- None => quote_identifier(&self.name),
+ None => quote_identifier(&self.name).to_string(),
}
}
@@ -405,7 +407,7 @@ mod tests {
&[],
)
.expect_err("should've failed to find field");
- let expected = r#"Schema error: No field named "z". Valid fields are
"t1"."a", "t1"."b", "t2"."c", "t2"."d", "t3"."a", "t3"."b", "t3"."c", "t3"."d",
"t3"."e"."#;
+ let expected = r#"Schema error: No field named z. Valid fields are
t1.a, t1.b, t2.c, t2.d, t3.a, t3.b, t3.c, t3.d, t3.e."#;
assert_eq!(err.to_string(), expected);
// ambiguous column reference
@@ -416,7 +418,7 @@ mod tests {
&[],
)
.expect_err("should've found ambiguous field");
- let expected = "Schema error: Ambiguous reference to unqualified field
\"a\"";
+ let expected = "Schema error: Ambiguous reference to unqualified field
a";
assert_eq!(err.to_string(), expected);
Ok(())
diff --git a/datafusion/common/src/dfschema.rs
b/datafusion/common/src/dfschema.rs
index 67a367c5ef..bb5820cdaa 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -735,6 +735,8 @@ impl DFField {
#[cfg(test)]
mod tests {
+ use crate::assert_contains;
+
use super::*;
use arrow::datatypes::DataType;
@@ -745,8 +747,28 @@ mod tests {
// lookup with unqualified name "t1.c0"
let err = schema.index_of_column(&col).unwrap_err();
assert_eq!(
- r#"Schema error: No field named "t1.c0". Valid fields are
"t1"."c0", "t1"."c1"."#,
- &format!("{err}")
+ err.to_string(),
+ "Schema error: No field named \"t1.c0\". Valid fields are t1.c0,
t1.c1.",
+ );
+ Ok(())
+ }
+
+ #[test]
+ fn quoted_qualifiers_in_name() -> Result<()> {
+ let col = Column::from_name("t1.c0");
+ let schema = DFSchema::try_from_qualified_schema(
+ "t1",
+ &Schema::new(vec![
+ Field::new("CapitalColumn", DataType::Boolean, true),
+ Field::new("field.with.period", DataType::Boolean, true),
+ ]),
+ )?;
+
+ // lookup with unqualified name "t1.c0"
+ let err = schema.index_of_column(&col).unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "Schema error: No field named \"t1.c0\". Valid fields are
t1.\"CapitalColumn\", t1.\"field.with.period\".",
);
Ok(())
}
@@ -819,11 +841,9 @@ mod tests {
let left = DFSchema::try_from_qualified_schema("t1",
&test_schema_1())?;
let right = DFSchema::try_from_qualified_schema("t1",
&test_schema_1())?;
let join = left.join(&right);
- assert!(join.is_err());
assert_eq!(
- "Schema error: Schema contains duplicate \
- qualified field name \"t1\".\"c0\"",
- &format!("{}", join.err().unwrap())
+ join.unwrap_err().to_string(),
+ "Schema error: Schema contains duplicate qualified field name
t1.c0",
);
Ok(())
}
@@ -833,11 +853,9 @@ mod tests {
let left = DFSchema::try_from(test_schema_1())?;
let right = DFSchema::try_from(test_schema_1())?;
let join = left.join(&right);
- assert!(join.is_err());
assert_eq!(
- "Schema error: Schema contains duplicate \
- unqualified field name \"c0\"",
- &format!("{}", join.err().unwrap())
+ join.unwrap_err().to_string(),
+ "Schema error: Schema contains duplicate unqualified field name
c0",
);
Ok(())
}
@@ -872,12 +890,9 @@ mod tests {
let left = DFSchema::try_from_qualified_schema("t1",
&test_schema_1())?;
let right = DFSchema::try_from(test_schema_1())?;
let join = left.join(&right);
- assert!(join.is_err());
- assert_eq!(
- "Schema error: Schema contains qualified \
- field name \"t1\".\"c0\" and unqualified field name \"c0\" which would
be ambiguous",
- &format!("{}", join.err().unwrap())
- );
+ assert_contains!(join.unwrap_err().to_string(),
+ "Schema error: Schema contains qualified \
+ field name t1.c0 and unqualified field name c0 which
would be ambiguous");
Ok(())
}
@@ -885,29 +900,28 @@ mod tests {
#[test]
fn helpful_error_messages() -> Result<()> {
let schema = DFSchema::try_from_qualified_schema("t1",
&test_schema_1())?;
- let expected_help = "Valid fields are \"t1\".\"c0\", \"t1\".\"c1\".";
+ let expected_help = "Valid fields are t1.c0, t1.c1.";
// Pertinent message parts
let expected_err_msg = "Fully qualified field name 't1.c0'";
- assert!(schema
- .field_with_qualified_name(&TableReference::bare("x"), "y")
- .unwrap_err()
- .to_string()
- .contains(expected_help));
- assert!(schema
- .field_with_unqualified_name("y")
- .unwrap_err()
- .to_string()
- .contains(expected_help));
- assert!(schema
- .index_of("y")
- .unwrap_err()
- .to_string()
- .contains(expected_help));
- assert!(schema
- .index_of("t1.c0")
- .unwrap_err()
- .to_string()
- .contains(expected_err_msg));
+ assert_contains!(
+ schema
+ .field_with_qualified_name(&TableReference::bare("x"), "y")
+ .unwrap_err()
+ .to_string(),
+ expected_help
+ );
+ assert_contains!(
+ schema
+ .field_with_unqualified_name("y")
+ .unwrap_err()
+ .to_string(),
+ expected_help
+ );
+ assert_contains!(schema.index_of("y").unwrap_err().to_string(),
expected_help);
+ assert_contains!(
+ schema.index_of("t1.c0").unwrap_err().to_string(),
+ expected_err_msg
+ );
Ok(())
}
@@ -916,16 +930,13 @@ mod tests {
let schema = DFSchema::empty();
let col = Column::from_qualified_name("t1.c0");
- let err = schema.index_of_column(&col).err().unwrap();
- assert_eq!(
- r#"Schema error: No field named "t1"."c0"."#,
- &format!("{err}")
- );
+ let err = schema.index_of_column(&col).unwrap_err();
+ assert_eq!(err.to_string(), "Schema error: No field named t1.c0.");
// the same check without qualifier
let col = Column::from_name("c0");
let err = schema.index_of_column(&col).err().unwrap();
- assert_eq!(r#"Schema error: No field named "c0"."#, &format!("{err}"));
+ assert_eq!("Schema error: No field named c0.", err.to_string());
}
#[test]
diff --git a/datafusion/common/src/table_reference.rs
b/datafusion/common/src/table_reference.rs
index e0558b4cdb..cd05f8082d 100644
--- a/datafusion/common/src/table_reference.rs
+++ b/datafusion/common/src/table_reference.rs
@@ -272,14 +272,14 @@ impl<'a> TableReference<'a> {
/// ```
/// # use datafusion_common::TableReference;
/// let table_reference = TableReference::partial("myschema", "mytable");
- /// assert_eq!(table_reference.to_quoted_string(),
r#""myschema"."mytable""#);
+ /// assert_eq!(table_reference.to_quoted_string(), "myschema.mytable");
///
/// let table_reference = TableReference::partial("MySchema", "MyTable");
/// assert_eq!(table_reference.to_quoted_string(),
r#""MySchema"."MyTable""#);
/// ```
pub fn to_quoted_string(&self) -> String {
match self {
- TableReference::Bare { table } => quote_identifier(table),
+ TableReference::Bare { table } =>
quote_identifier(table).to_string(),
TableReference::Partial { schema, table } => {
format!("{}.{}", quote_identifier(schema),
quote_identifier(table))
}
diff --git a/datafusion/common/src/utils.rs b/datafusion/common/src/utils.rs
index a1226def8e..6872653fb4 100644
--- a/datafusion/common/src/utils.rs
+++ b/datafusion/common/src/utils.rs
@@ -24,6 +24,7 @@ use sqlparser::ast::Ident;
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::{Parser, ParserError};
use sqlparser::tokenizer::{Token, TokenWithLocation};
+use std::borrow::Cow;
use std::cmp::Ordering;
/// Given column vectors, returns row at `idx`.
@@ -166,8 +167,26 @@ where
/// the identifier by replacing it with two double quotes
///
/// e.g. identifier `tab.le"name` becomes `"tab.le""name"`
-pub fn quote_identifier(s: &str) -> String {
- format!("\"{}\"", s.replace('"', "\"\""))
+pub fn quote_identifier(s: &str) -> Cow<str> {
+ if needs_quotes(s) {
+ Cow::Owned(format!("\"{}\"", s.replace('"', "\"\"")))
+ } else {
+ Cow::Borrowed(s)
+ }
+}
+
+/// returns true if this identifier needs quotes
+fn needs_quotes(s: &str) -> bool {
+ let mut chars = s.chars();
+
+ // first char can not be a number unless escaped
+ if let Some(first_char) = chars.next() {
+ if !(first_char.is_ascii_lowercase() || first_char == '_') {
+ return true;
+ }
+ }
+
+ !chars.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_')
}
// TODO: remove when can use
https://github.com/sqlparser-rs/sqlparser-rs/issues/805
@@ -464,4 +483,47 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_quote_identifier() -> Result<()> {
+ let cases = vec![
+ ("foo", r#"foo"#),
+ ("_foo", r#"_foo"#),
+ ("foo_bar", r#"foo_bar"#),
+ ("foo-bar", r#""foo-bar""#),
+ // name itself has a period, needs to be quoted
+ ("foo.bar", r#""foo.bar""#),
+ ("Foo", r#""Foo""#),
+ ("Foo.Bar", r#""Foo.Bar""#),
+ // name starting with a number needs to be quoted
+ ("test1", r#"test1"#),
+ ("1test", r#""1test""#),
+ ];
+
+ for (identifier, quoted_identifier) in cases {
+ println!("input:
\n{identifier}\nquoted_identifier:\n{quoted_identifier}");
+
+ assert_eq!(quote_identifier(identifier), quoted_identifier);
+
+ // When parsing the quoted identifier, it should be a
+ // a single identifier without normalization, and not in multiple
parts
+ let quote_style = if quoted_identifier.starts_with('"') {
+ Some('"')
+ } else {
+ None
+ };
+
+ let expected_parsed = vec![Ident {
+ value: identifier.to_string(),
+ quote_style,
+ }];
+
+ assert_eq!(
+ parse_identifiers(quoted_identifier).unwrap(),
+ expected_parsed
+ );
+ }
+
+ Ok(())
+ }
}
diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs
index e3e2987d3d..d7ce056a33 100644
--- a/datafusion/core/src/dataframe.rs
+++ b/datafusion/core/src/dataframe.rs
@@ -1371,7 +1371,7 @@ mod tests {
let join = left
.join_on(right, JoinType::Inner, [col("c1").eq(col("c1"))])
.expect_err("join didn't fail check");
- let expected = "Schema error: Ambiguous reference to unqualified field
\"c1\"";
+ let expected = "Schema error: Ambiguous reference to unqualified field
c1";
assert_eq!(join.to_string(), expected);
Ok(())
diff --git a/datafusion/core/tests/dataframe.rs
b/datafusion/core/tests/dataframe.rs
index 23c6623ab1..4b2daa100b 100644
--- a/datafusion/core/tests/dataframe.rs
+++ b/datafusion/core/tests/dataframe.rs
@@ -331,7 +331,7 @@ async fn sort_on_ambiguous_column() -> Result<()> {
.sort(vec![col("b").sort(true, true)])
.unwrap_err();
- let expected = "Schema error: Ambiguous reference to unqualified field
\"b\"";
+ let expected = "Schema error: Ambiguous reference to unqualified field b";
assert_eq!(err.to_string(), expected);
Ok(())
}
@@ -350,7 +350,7 @@ async fn group_by_ambiguous_column() -> Result<()> {
.aggregate(vec![col("b")], vec![max(col("a"))])
.unwrap_err();
- let expected = "Schema error: Ambiguous reference to unqualified field
\"b\"";
+ let expected = "Schema error: Ambiguous reference to unqualified field b";
assert_eq!(err.to_string(), expected);
Ok(())
}
@@ -369,7 +369,7 @@ async fn filter_on_ambiguous_column() -> Result<()> {
.filter(col("b").eq(lit(1)))
.unwrap_err();
- let expected = "Schema error: Ambiguous reference to unqualified field
\"b\"";
+ let expected = "Schema error: Ambiguous reference to unqualified field b";
assert_eq!(err.to_string(), expected);
Ok(())
}
@@ -388,7 +388,7 @@ async fn select_ambiguous_column() -> Result<()> {
.select(vec![col("b")])
.unwrap_err();
- let expected = "Schema error: Ambiguous reference to unqualified field
\"b\"";
+ let expected = "Schema error: Ambiguous reference to unqualified field b";
assert_eq!(err.to_string(), expected);
Ok(())
}
diff --git a/datafusion/core/tests/sql/idenfifers.rs
b/datafusion/core/tests/sql/idenfifers.rs
index 1b57f60bd4..23fb26edde 100644
--- a/datafusion/core/tests/sql/idenfifers.rs
+++ b/datafusion/core/tests/sql/idenfifers.rs
@@ -211,28 +211,28 @@ async fn case_insensitive_in_sql_errors() {
.await
.unwrap_err()
.to_string();
- assert_contains!(actual, r#"No field named "column1""#);
+ assert_contains!(actual, r#"No field named column1"#);
let actual = ctx
.sql("SELECT Column1 from test")
.await
.unwrap_err()
.to_string();
- assert_contains!(actual, r#"No field named "column1""#);
+ assert_contains!(actual, r#"No field named column1"#);
let actual = ctx
.sql("SELECT column1 from test")
.await
.unwrap_err()
.to_string();
- assert_contains!(actual, r#"No field named "column1""#);
+ assert_contains!(actual, r#"No field named column1"#);
let actual = ctx
.sql(r#"SELECT "column1" from test"#)
.await
.unwrap_err()
.to_string();
- assert_contains!(actual, r#"No field named "column1""#);
+ assert_contains!(actual, r#"No field named column1"#);
// This should pass (note the quotes)
ctx.sql(r#"SELECT "Column1" from test"#).await.unwrap();
diff --git a/datafusion/core/tests/sql/references.rs
b/datafusion/core/tests/sql/references.rs
index 335bc63086..60191e5213 100644
--- a/datafusion/core/tests/sql/references.rs
+++ b/datafusion/core/tests/sql/references.rs
@@ -67,7 +67,7 @@ async fn qualified_table_references_and_fields() ->
Result<()> {
let error = ctx.sql(sql).await.unwrap_err();
assert_contains!(
error.to_string(),
- r#"No field named "f1"."c1". Valid fields are "test"."f.c1",
"test"."test.c2""#
+ r#"No field named f1.c1. Valid fields are test."f.c1", test."test.c2""#
);
// however, enclosing it in double quotes is ok
diff --git a/datafusion/core/tests/sqllogictests/test_files/join.slt
b/datafusion/core/tests/sqllogictests/test_files/join.slt
index 5f1c067ece..b940232a74 100644
--- a/datafusion/core/tests/sqllogictests/test_files/join.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/join.slt
@@ -87,7 +87,7 @@ statement ok
set datafusion.execution.batch_size = 4096;
# left semi with wrong where clause
-query error DataFusion error: Schema error: No field named "t2"."t2_id". Valid
fields are "t1"."t1_id", "t1"."t1_name", "t1"."t1_int".
+query error DataFusion error: Schema error: No field named t2\.t2_id\. Valid
fields are t1\.t1_id, t1\.t1_name, t1\.t1_int\.
SELECT t1.t1_id, t1.t1_name, t1.t1_int
FROM t1
LEFT SEMI JOIN t2 ON t1.t1_id = t2.t2_id
diff --git a/datafusion/expr/src/expr_rewriter/mod.rs
b/datafusion/expr/src/expr_rewriter/mod.rs
index c0db5e1216..63b5d3ed67 100644
--- a/datafusion/expr/src/expr_rewriter/mod.rs
+++ b/datafusion/expr/src/expr_rewriter/mod.rs
@@ -339,7 +339,7 @@ mod test {
.to_string();
assert_eq!(
error,
- r#"Schema error: No field named "b". Valid fields are
"tableA"."a"."#
+ r#"Schema error: No field named b. Valid fields are "tableA".a."#
);
}
diff --git a/datafusion/sql/tests/integration_test.rs
b/datafusion/sql/tests/integration_test.rs
index f6e549b01f..99a6a90aa0 100644
--- a/datafusion/sql/tests/integration_test.rs
+++ b/datafusion/sql/tests/integration_test.rs
@@ -324,11 +324,11 @@ Dml: op=[Insert] table=[test_decimal]
#[rstest]
#[case::duplicate_columns(
"INSERT INTO test_decimal (id, price, price) VALUES (1, 2, 3), (4, 5, 6)",
- "Schema error: Schema contains duplicate unqualified field name \"price\""
+ "Schema error: Schema contains duplicate unqualified field name price"
)]
#[case::non_existing_column(
"INSERT INTO test_decimal (nonexistent, price) VALUES (1, 2), (4, 5)",
- "Schema error: No field named \"nonexistent\". Valid fields are \"id\",
\"price\"."
+ "Schema error: No field named nonexistent. Valid fields are id, price."
)]
#[case::type_mismatch(
"INSERT INTO test_decimal SELECT '2022-01-01',
to_timestamp('2022-01-01T12:00:00')",
@@ -1218,9 +1218,9 @@ fn
select_simple_aggregate_with_groupby_column_unselected() {
fn
select_simple_aggregate_with_groupby_and_column_in_group_by_does_not_exist() {
let sql = "SELECT SUM(age) FROM person GROUP BY doesnotexist";
let err = logical_plan(sql).expect_err("query should have failed");
- assert_eq!("Schema error: No field named \"doesnotexist\". Valid fields
are \"SUM(person.age)\", \
- \"person\".\"id\", \"person\".\"first_name\",
\"person\".\"last_name\", \"person\".\"age\", \"person\".\"state\", \
- \"person\".\"salary\", \"person\".\"birth_date\", \"person\".\"😀\".",
format!("{err}"));
+ assert_eq!("Schema error: No field named doesnotexist. Valid fields are
\"SUM(person.age)\", \
+ person.id, person.first_name, person.last_name, person.age,
person.state, \
+ person.salary, person.birth_date, person.\"😀\".", format!("{err}"));
}
#[test]
@@ -3072,7 +3072,7 @@ fn order_by_unaliased_name() {
#[test]
fn order_by_ambiguous_name() {
let sql = "select * from person a join person b using (id) order by age";
- let expected = "Schema error: Ambiguous reference to unqualified field
\"age\"";
+ let expected = "Schema error: Ambiguous reference to unqualified field
age";
let err = logical_plan(sql).unwrap_err();
assert_eq!(err.to_string(), expected);
@@ -3081,7 +3081,7 @@ fn order_by_ambiguous_name() {
#[test]
fn group_by_ambiguous_name() {
let sql = "select max(id) from person a join person b using (id) group by
age";
- let expected = "Schema error: Ambiguous reference to unqualified field
\"age\"";
+ let expected = "Schema error: Ambiguous reference to unqualified field
age";
let err = logical_plan(sql).unwrap_err();
assert_eq!(err.to_string(), expected);
@@ -3398,7 +3398,7 @@ fn test_ambiguous_column_references_in_on_join() {
INNER JOIN person as p2
ON id = 1";
- let expected = "Schema error: Ambiguous reference to unqualified field
\"id\"";
+ let expected = "Schema error: Ambiguous reference to unqualified field id";
// It should return error.
let result = logical_plan(sql);
@@ -4038,7 +4038,7 @@ fn assert_field_not_found(err: DataFusionError, name:
&str) {
match err {
DataFusionError::SchemaError { .. } => {
let msg = format!("{err}");
- let expected = format!("Schema error: No field named \"{name}\".");
+ let expected = format!("Schema error: No field named {name}.");
if !msg.starts_with(&expected) {
panic!("error [{msg}] did not start with [{expected}]");
}