This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new a1c60a1ba9 feat: Quote column names if required in error messages 
(#5778)
a1c60a1ba9 is described below

commit a1c60a1ba98e089d7551637f2a78663e66772d88
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Apr 5 21:45:27 2023 +0200

    feat: Quote column names if required in error messages (#5778)
    
    * feat: Quote column names if required in error messages
    
    * Update sqllogictest
    
    * Update test
    
    * Update datafusion/common/src/table_reference.rs
    
    Co-authored-by: Jeffrey <[email protected]>
    
    ---------
    
    Co-authored-by: Jeffrey <[email protected]>
---
 datafusion/common/src/column.rs                    | 14 ++--
 datafusion/common/src/dfschema.rs                  | 97 ++++++++++++----------
 datafusion/common/src/table_reference.rs           |  4 +-
 datafusion/common/src/utils.rs                     | 66 ++++++++++++++-
 datafusion/core/src/dataframe.rs                   |  2 +-
 datafusion/core/tests/dataframe.rs                 |  8 +-
 datafusion/core/tests/sql/idenfifers.rs            |  8 +-
 datafusion/core/tests/sql/references.rs            |  2 +-
 .../core/tests/sqllogictests/test_files/join.slt   |  2 +-
 datafusion/expr/src/expr_rewriter/mod.rs           |  2 +-
 datafusion/sql/tests/integration_test.rs           | 18 ++--
 11 files changed, 149 insertions(+), 74 deletions(-)

diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs
index 78c1857238..d138bb06cd 100644
--- a/datafusion/common/src/column.rs
+++ b/datafusion/common/src/column.rs
@@ -112,13 +112,15 @@ impl Column {
 
     /// Serialize column into a quoted flat name string
     pub fn quoted_flat_name(&self) -> String {
-        // TODO: quote identifiers only when special characters present
-        // see: https://github.com/apache/arrow-datafusion/issues/5523
         match &self.relation {
             Some(r) => {
-                format!("{}.{}", r.to_quoted_string(), 
quote_identifier(&self.name))
+                format!(
+                    "{}.{}",
+                    r.to_quoted_string(),
+                    quote_identifier(self.name.as_str())
+                )
             }
-            None => quote_identifier(&self.name),
+            None => quote_identifier(&self.name).to_string(),
         }
     }
 
@@ -405,7 +407,7 @@ mod tests {
                 &[],
             )
             .expect_err("should've failed to find field");
-        let expected = r#"Schema error: No field named "z". Valid fields are 
"t1"."a", "t1"."b", "t2"."c", "t2"."d", "t3"."a", "t3"."b", "t3"."c", "t3"."d", 
"t3"."e"."#;
+        let expected = r#"Schema error: No field named z. Valid fields are 
t1.a, t1.b, t2.c, t2.d, t3.a, t3.b, t3.c, t3.d, t3.e."#;
         assert_eq!(err.to_string(), expected);
 
         // ambiguous column reference
@@ -416,7 +418,7 @@ mod tests {
                 &[],
             )
             .expect_err("should've found ambiguous field");
-        let expected = "Schema error: Ambiguous reference to unqualified field 
\"a\"";
+        let expected = "Schema error: Ambiguous reference to unqualified field 
a";
         assert_eq!(err.to_string(), expected);
 
         Ok(())
diff --git a/datafusion/common/src/dfschema.rs 
b/datafusion/common/src/dfschema.rs
index 67a367c5ef..bb5820cdaa 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -735,6 +735,8 @@ impl DFField {
 
 #[cfg(test)]
 mod tests {
+    use crate::assert_contains;
+
     use super::*;
     use arrow::datatypes::DataType;
 
@@ -745,8 +747,28 @@ mod tests {
         // lookup with unqualified name "t1.c0"
         let err = schema.index_of_column(&col).unwrap_err();
         assert_eq!(
-            r#"Schema error: No field named "t1.c0". Valid fields are 
"t1"."c0", "t1"."c1"."#,
-            &format!("{err}")
+            err.to_string(),
+            "Schema error: No field named \"t1.c0\". Valid fields are t1.c0, 
t1.c1.",
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn quoted_qualifiers_in_name() -> Result<()> {
+        let col = Column::from_name("t1.c0");
+        let schema = DFSchema::try_from_qualified_schema(
+            "t1",
+            &Schema::new(vec![
+                Field::new("CapitalColumn", DataType::Boolean, true),
+                Field::new("field.with.period", DataType::Boolean, true),
+            ]),
+        )?;
+
+        // lookup with unqualified name "t1.c0"
+        let err = schema.index_of_column(&col).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Schema error: No field named \"t1.c0\". Valid fields are 
t1.\"CapitalColumn\", t1.\"field.with.period\".",
         );
         Ok(())
     }
@@ -819,11 +841,9 @@ mod tests {
         let left = DFSchema::try_from_qualified_schema("t1", 
&test_schema_1())?;
         let right = DFSchema::try_from_qualified_schema("t1", 
&test_schema_1())?;
         let join = left.join(&right);
-        assert!(join.is_err());
         assert_eq!(
-            "Schema error: Schema contains duplicate \
-        qualified field name \"t1\".\"c0\"",
-            &format!("{}", join.err().unwrap())
+            join.unwrap_err().to_string(),
+            "Schema error: Schema contains duplicate qualified field name 
t1.c0",
         );
         Ok(())
     }
@@ -833,11 +853,9 @@ mod tests {
         let left = DFSchema::try_from(test_schema_1())?;
         let right = DFSchema::try_from(test_schema_1())?;
         let join = left.join(&right);
-        assert!(join.is_err());
         assert_eq!(
-            "Schema error: Schema contains duplicate \
-        unqualified field name \"c0\"",
-            &format!("{}", join.err().unwrap())
+            join.unwrap_err().to_string(),
+            "Schema error: Schema contains duplicate unqualified field name 
c0",
         );
         Ok(())
     }
@@ -872,12 +890,9 @@ mod tests {
         let left = DFSchema::try_from_qualified_schema("t1", 
&test_schema_1())?;
         let right = DFSchema::try_from(test_schema_1())?;
         let join = left.join(&right);
-        assert!(join.is_err());
-        assert_eq!(
-            "Schema error: Schema contains qualified \
-        field name \"t1\".\"c0\" and unqualified field name \"c0\" which would 
be ambiguous",
-            &format!("{}", join.err().unwrap())
-        );
+        assert_contains!(join.unwrap_err().to_string(),
+                         "Schema error: Schema contains qualified \
+                          field name t1.c0 and unqualified field name c0 which 
would be ambiguous");
         Ok(())
     }
 
@@ -885,29 +900,28 @@ mod tests {
     #[test]
     fn helpful_error_messages() -> Result<()> {
         let schema = DFSchema::try_from_qualified_schema("t1", 
&test_schema_1())?;
-        let expected_help = "Valid fields are \"t1\".\"c0\", \"t1\".\"c1\".";
+        let expected_help = "Valid fields are t1.c0, t1.c1.";
         // Pertinent message parts
         let expected_err_msg = "Fully qualified field name 't1.c0'";
-        assert!(schema
-            .field_with_qualified_name(&TableReference::bare("x"), "y")
-            .unwrap_err()
-            .to_string()
-            .contains(expected_help));
-        assert!(schema
-            .field_with_unqualified_name("y")
-            .unwrap_err()
-            .to_string()
-            .contains(expected_help));
-        assert!(schema
-            .index_of("y")
-            .unwrap_err()
-            .to_string()
-            .contains(expected_help));
-        assert!(schema
-            .index_of("t1.c0")
-            .unwrap_err()
-            .to_string()
-            .contains(expected_err_msg));
+        assert_contains!(
+            schema
+                .field_with_qualified_name(&TableReference::bare("x"), "y")
+                .unwrap_err()
+                .to_string(),
+            expected_help
+        );
+        assert_contains!(
+            schema
+                .field_with_unqualified_name("y")
+                .unwrap_err()
+                .to_string(),
+            expected_help
+        );
+        assert_contains!(schema.index_of("y").unwrap_err().to_string(), 
expected_help);
+        assert_contains!(
+            schema.index_of("t1.c0").unwrap_err().to_string(),
+            expected_err_msg
+        );
         Ok(())
     }
 
@@ -916,16 +930,13 @@ mod tests {
         let schema = DFSchema::empty();
 
         let col = Column::from_qualified_name("t1.c0");
-        let err = schema.index_of_column(&col).err().unwrap();
-        assert_eq!(
-            r#"Schema error: No field named "t1"."c0"."#,
-            &format!("{err}")
-        );
+        let err = schema.index_of_column(&col).unwrap_err();
+        assert_eq!(err.to_string(), "Schema error: No field named t1.c0.");
 
         // the same check without qualifier
         let col = Column::from_name("c0");
         let err = schema.index_of_column(&col).err().unwrap();
-        assert_eq!(r#"Schema error: No field named "c0"."#, &format!("{err}"));
+        assert_eq!("Schema error: No field named c0.", err.to_string());
     }
 
     #[test]
diff --git a/datafusion/common/src/table_reference.rs 
b/datafusion/common/src/table_reference.rs
index e0558b4cdb..cd05f8082d 100644
--- a/datafusion/common/src/table_reference.rs
+++ b/datafusion/common/src/table_reference.rs
@@ -272,14 +272,14 @@ impl<'a> TableReference<'a> {
     /// ```
     /// # use datafusion_common::TableReference;
     /// let table_reference = TableReference::partial("myschema", "mytable");
-    /// assert_eq!(table_reference.to_quoted_string(), 
r#""myschema"."mytable""#);
+    /// assert_eq!(table_reference.to_quoted_string(), "myschema.mytable");
     ///
     /// let table_reference = TableReference::partial("MySchema", "MyTable");
     /// assert_eq!(table_reference.to_quoted_string(), 
r#""MySchema"."MyTable""#);
     /// ```
     pub fn to_quoted_string(&self) -> String {
         match self {
-            TableReference::Bare { table } => quote_identifier(table),
+            TableReference::Bare { table } => 
quote_identifier(table).to_string(),
             TableReference::Partial { schema, table } => {
                 format!("{}.{}", quote_identifier(schema), 
quote_identifier(table))
             }
diff --git a/datafusion/common/src/utils.rs b/datafusion/common/src/utils.rs
index a1226def8e..6872653fb4 100644
--- a/datafusion/common/src/utils.rs
+++ b/datafusion/common/src/utils.rs
@@ -24,6 +24,7 @@ use sqlparser::ast::Ident;
 use sqlparser::dialect::GenericDialect;
 use sqlparser::parser::{Parser, ParserError};
 use sqlparser::tokenizer::{Token, TokenWithLocation};
+use std::borrow::Cow;
 use std::cmp::Ordering;
 
 /// Given column vectors, returns row at `idx`.
@@ -166,8 +167,26 @@ where
 /// the identifier by replacing it with two double quotes
 ///
 /// e.g. identifier `tab.le"name` becomes `"tab.le""name"`
-pub fn quote_identifier(s: &str) -> String {
-    format!("\"{}\"", s.replace('"', "\"\""))
+pub fn quote_identifier(s: &str) -> Cow<str> {
+    if needs_quotes(s) {
+        Cow::Owned(format!("\"{}\"", s.replace('"', "\"\"")))
+    } else {
+        Cow::Borrowed(s)
+    }
+}
+
+/// returns true if this identifier needs quotes
+fn needs_quotes(s: &str) -> bool {
+    let mut chars = s.chars();
+
+    // first char can not be a number unless escaped
+    if let Some(first_char) = chars.next() {
+        if !(first_char.is_ascii_lowercase() || first_char == '_') {
+            return true;
+        }
+    }
+
+    !chars.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_')
 }
 
 // TODO: remove when can use 
https://github.com/sqlparser-rs/sqlparser-rs/issues/805
@@ -464,4 +483,47 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_quote_identifier() -> Result<()> {
+        let cases = vec![
+            ("foo", r#"foo"#),
+            ("_foo", r#"_foo"#),
+            ("foo_bar", r#"foo_bar"#),
+            ("foo-bar", r#""foo-bar""#),
+            // name itself has a period, needs to be quoted
+            ("foo.bar", r#""foo.bar""#),
+            ("Foo", r#""Foo""#),
+            ("Foo.Bar", r#""Foo.Bar""#),
+            // name starting with a number needs to be quoted
+            ("test1", r#"test1"#),
+            ("1test", r#""1test""#),
+        ];
+
+        for (identifier, quoted_identifier) in cases {
+            println!("input: 
\n{identifier}\nquoted_identifier:\n{quoted_identifier}");
+
+            assert_eq!(quote_identifier(identifier), quoted_identifier);
+
+            // When parsing the quoted identifier, it should be a
+            // a single identifier without normalization, and not in multiple 
parts
+            let quote_style = if quoted_identifier.starts_with('"') {
+                Some('"')
+            } else {
+                None
+            };
+
+            let expected_parsed = vec![Ident {
+                value: identifier.to_string(),
+                quote_style,
+            }];
+
+            assert_eq!(
+                parse_identifiers(quoted_identifier).unwrap(),
+                expected_parsed
+            );
+        }
+
+        Ok(())
+    }
 }
diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs
index e3e2987d3d..d7ce056a33 100644
--- a/datafusion/core/src/dataframe.rs
+++ b/datafusion/core/src/dataframe.rs
@@ -1371,7 +1371,7 @@ mod tests {
         let join = left
             .join_on(right, JoinType::Inner, [col("c1").eq(col("c1"))])
             .expect_err("join didn't fail check");
-        let expected = "Schema error: Ambiguous reference to unqualified field 
\"c1\"";
+        let expected = "Schema error: Ambiguous reference to unqualified field 
c1";
         assert_eq!(join.to_string(), expected);
 
         Ok(())
diff --git a/datafusion/core/tests/dataframe.rs 
b/datafusion/core/tests/dataframe.rs
index 23c6623ab1..4b2daa100b 100644
--- a/datafusion/core/tests/dataframe.rs
+++ b/datafusion/core/tests/dataframe.rs
@@ -331,7 +331,7 @@ async fn sort_on_ambiguous_column() -> Result<()> {
         .sort(vec![col("b").sort(true, true)])
         .unwrap_err();
 
-    let expected = "Schema error: Ambiguous reference to unqualified field 
\"b\"";
+    let expected = "Schema error: Ambiguous reference to unqualified field b";
     assert_eq!(err.to_string(), expected);
     Ok(())
 }
@@ -350,7 +350,7 @@ async fn group_by_ambiguous_column() -> Result<()> {
         .aggregate(vec![col("b")], vec![max(col("a"))])
         .unwrap_err();
 
-    let expected = "Schema error: Ambiguous reference to unqualified field 
\"b\"";
+    let expected = "Schema error: Ambiguous reference to unqualified field b";
     assert_eq!(err.to_string(), expected);
     Ok(())
 }
@@ -369,7 +369,7 @@ async fn filter_on_ambiguous_column() -> Result<()> {
         .filter(col("b").eq(lit(1)))
         .unwrap_err();
 
-    let expected = "Schema error: Ambiguous reference to unqualified field 
\"b\"";
+    let expected = "Schema error: Ambiguous reference to unqualified field b";
     assert_eq!(err.to_string(), expected);
     Ok(())
 }
@@ -388,7 +388,7 @@ async fn select_ambiguous_column() -> Result<()> {
         .select(vec![col("b")])
         .unwrap_err();
 
-    let expected = "Schema error: Ambiguous reference to unqualified field 
\"b\"";
+    let expected = "Schema error: Ambiguous reference to unqualified field b";
     assert_eq!(err.to_string(), expected);
     Ok(())
 }
diff --git a/datafusion/core/tests/sql/idenfifers.rs 
b/datafusion/core/tests/sql/idenfifers.rs
index 1b57f60bd4..23fb26edde 100644
--- a/datafusion/core/tests/sql/idenfifers.rs
+++ b/datafusion/core/tests/sql/idenfifers.rs
@@ -211,28 +211,28 @@ async fn case_insensitive_in_sql_errors() {
         .await
         .unwrap_err()
         .to_string();
-    assert_contains!(actual, r#"No field named "column1""#);
+    assert_contains!(actual, r#"No field named column1"#);
 
     let actual = ctx
         .sql("SELECT Column1 from test")
         .await
         .unwrap_err()
         .to_string();
-    assert_contains!(actual, r#"No field named "column1""#);
+    assert_contains!(actual, r#"No field named column1"#);
 
     let actual = ctx
         .sql("SELECT column1 from test")
         .await
         .unwrap_err()
         .to_string();
-    assert_contains!(actual, r#"No field named "column1""#);
+    assert_contains!(actual, r#"No field named column1"#);
 
     let actual = ctx
         .sql(r#"SELECT "column1" from test"#)
         .await
         .unwrap_err()
         .to_string();
-    assert_contains!(actual, r#"No field named "column1""#);
+    assert_contains!(actual, r#"No field named column1"#);
 
     // This should pass (note the quotes)
     ctx.sql(r#"SELECT "Column1" from test"#).await.unwrap();
diff --git a/datafusion/core/tests/sql/references.rs 
b/datafusion/core/tests/sql/references.rs
index 335bc63086..60191e5213 100644
--- a/datafusion/core/tests/sql/references.rs
+++ b/datafusion/core/tests/sql/references.rs
@@ -67,7 +67,7 @@ async fn qualified_table_references_and_fields() -> 
Result<()> {
     let error = ctx.sql(sql).await.unwrap_err();
     assert_contains!(
         error.to_string(),
-        r#"No field named "f1"."c1". Valid fields are "test"."f.c1", 
"test"."test.c2""#
+        r#"No field named f1.c1. Valid fields are test."f.c1", test."test.c2""#
     );
 
     // however, enclosing it in double quotes is ok
diff --git a/datafusion/core/tests/sqllogictests/test_files/join.slt 
b/datafusion/core/tests/sqllogictests/test_files/join.slt
index 5f1c067ece..b940232a74 100644
--- a/datafusion/core/tests/sqllogictests/test_files/join.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/join.slt
@@ -87,7 +87,7 @@ statement ok
 set datafusion.execution.batch_size = 4096;
 
 # left semi with wrong where clause
-query error DataFusion error: Schema error: No field named "t2"."t2_id". Valid 
fields are "t1"."t1_id", "t1"."t1_name", "t1"."t1_int".
+query error DataFusion error: Schema error: No field named t2\.t2_id\. Valid 
fields are t1\.t1_id, t1\.t1_name, t1\.t1_int\.
 SELECT t1.t1_id, t1.t1_name, t1.t1_int
 FROM t1
        LEFT SEMI JOIN t2 ON t1.t1_id = t2.t2_id
diff --git a/datafusion/expr/src/expr_rewriter/mod.rs 
b/datafusion/expr/src/expr_rewriter/mod.rs
index c0db5e1216..63b5d3ed67 100644
--- a/datafusion/expr/src/expr_rewriter/mod.rs
+++ b/datafusion/expr/src/expr_rewriter/mod.rs
@@ -339,7 +339,7 @@ mod test {
                 .to_string();
         assert_eq!(
             error,
-            r#"Schema error: No field named "b". Valid fields are 
"tableA"."a"."#
+            r#"Schema error: No field named b. Valid fields are "tableA".a."#
         );
     }
 
diff --git a/datafusion/sql/tests/integration_test.rs 
b/datafusion/sql/tests/integration_test.rs
index f6e549b01f..99a6a90aa0 100644
--- a/datafusion/sql/tests/integration_test.rs
+++ b/datafusion/sql/tests/integration_test.rs
@@ -324,11 +324,11 @@ Dml: op=[Insert] table=[test_decimal]
 #[rstest]
 #[case::duplicate_columns(
     "INSERT INTO test_decimal (id, price, price) VALUES (1, 2, 3), (4, 5, 6)",
-    "Schema error: Schema contains duplicate unqualified field name \"price\""
+    "Schema error: Schema contains duplicate unqualified field name price"
 )]
 #[case::non_existing_column(
     "INSERT INTO test_decimal (nonexistent, price) VALUES (1, 2), (4, 5)",
-    "Schema error: No field named \"nonexistent\". Valid fields are \"id\", 
\"price\"."
+    "Schema error: No field named nonexistent. Valid fields are id, price."
 )]
 #[case::type_mismatch(
     "INSERT INTO test_decimal SELECT '2022-01-01', 
to_timestamp('2022-01-01T12:00:00')",
@@ -1218,9 +1218,9 @@ fn 
select_simple_aggregate_with_groupby_column_unselected() {
 fn 
select_simple_aggregate_with_groupby_and_column_in_group_by_does_not_exist() {
     let sql = "SELECT SUM(age) FROM person GROUP BY doesnotexist";
     let err = logical_plan(sql).expect_err("query should have failed");
-    assert_eq!("Schema error: No field named \"doesnotexist\". Valid fields 
are \"SUM(person.age)\", \
-        \"person\".\"id\", \"person\".\"first_name\", 
\"person\".\"last_name\", \"person\".\"age\", \"person\".\"state\", \
-        \"person\".\"salary\", \"person\".\"birth_date\", \"person\".\"😀\".", 
format!("{err}"));
+    assert_eq!("Schema error: No field named doesnotexist. Valid fields are 
\"SUM(person.age)\", \
+        person.id, person.first_name, person.last_name, person.age, 
person.state, \
+        person.salary, person.birth_date, person.\"😀\".", format!("{err}"));
 }
 
 #[test]
@@ -3072,7 +3072,7 @@ fn order_by_unaliased_name() {
 #[test]
 fn order_by_ambiguous_name() {
     let sql = "select * from person a join person b using (id) order by age";
-    let expected = "Schema error: Ambiguous reference to unqualified field 
\"age\"";
+    let expected = "Schema error: Ambiguous reference to unqualified field 
age";
 
     let err = logical_plan(sql).unwrap_err();
     assert_eq!(err.to_string(), expected);
@@ -3081,7 +3081,7 @@ fn order_by_ambiguous_name() {
 #[test]
 fn group_by_ambiguous_name() {
     let sql = "select max(id) from person a join person b using (id) group by 
age";
-    let expected = "Schema error: Ambiguous reference to unqualified field 
\"age\"";
+    let expected = "Schema error: Ambiguous reference to unqualified field 
age";
 
     let err = logical_plan(sql).unwrap_err();
     assert_eq!(err.to_string(), expected);
@@ -3398,7 +3398,7 @@ fn test_ambiguous_column_references_in_on_join() {
             INNER JOIN person as p2
             ON id = 1";
 
-    let expected = "Schema error: Ambiguous reference to unqualified field 
\"id\"";
+    let expected = "Schema error: Ambiguous reference to unqualified field id";
 
     // It should return error.
     let result = logical_plan(sql);
@@ -4038,7 +4038,7 @@ fn assert_field_not_found(err: DataFusionError, name: 
&str) {
     match err {
         DataFusionError::SchemaError { .. } => {
             let msg = format!("{err}");
-            let expected = format!("Schema error: No field named \"{name}\".");
+            let expected = format!("Schema error: No field named {name}.");
             if !msg.starts_with(&expected) {
                 panic!("error [{msg}] did not start with [{expected}]");
             }

Reply via email to