alamb commented on code in PR #5343: URL: https://github.com/apache/arrow-datafusion/pull/5343#discussion_r1119321888
########## datafusion/common/src/dfschema.rs: ########## @@ -200,19 +201,19 @@ impl DFSchema { // field to lookup is qualified. // current field is qualified and not shared between relations, compare both // qualifier and name. - (Some(q), Some(field_q)) => q == field_q && field.name() == name, + (Some(q), Some(field_q)) => { + q.resolved_eq(&field_q.as_table_reference()) && field.name() == name + } // field to lookup is qualified but current field is unqualified. (Some(qq), None) => { // the original field may now be aliased with a name that matches the // original qualified name - let table_ref = TableReference::parse_str(field.name().as_str()); - match table_ref { - TableReference::Partial { schema, table } => { - schema == qq && table == name - } - TableReference::Full { schema, table, .. } => { - schema == qq && table == name - } + let column = Column::from_qualified_name(field.name()); Review Comment: this seems like a nicer API for sure ########## datafusion/expr/src/logical_plan/builder.rs: ########## @@ -545,14 +545,22 @@ impl LogicalPlanBuilder { match (&l.relation, &r.relation) { (Some(lr), Some(rr)) => { - let l_is_left = - self.plan.schema().field_with_qualified_name(lr, &l.name); - let l_is_right = - right.schema().field_with_qualified_name(lr, &l.name); - let r_is_left = - self.plan.schema().field_with_qualified_name(rr, &r.name); - let r_is_right = - right.schema().field_with_qualified_name(rr, &r.name); + let l_is_left = self.plan.schema().field_with_qualified_name( + &lr.as_table_reference(), Review Comment: I tried playing around with the signature of `field_with_qualified_name` and I couldn't find a way to make `lr` still work. 🤷 ########## datafusion/common/src/table_reference.rs: ########## @@ -226,57 +258,156 @@ impl<'a> TableReference<'a> { } } -// TODO: remove when can use https://github.com/sqlparser-rs/sqlparser-rs/issues/805 -fn parse_identifiers(s: &str) -> Result<Vec<Ident>> { - let dialect = GenericDialect; - let mut parser = Parser::new(&dialect).try_with_sql(s)?; - let mut idents = vec![]; - - // expecting at least one word for identifier - match parser.next_token_no_skip() { - Some(TokenWithLocation { - token: Token::Word(w), - .. - }) => idents.push(w.to_ident()), - Some(TokenWithLocation { token, .. }) => { - return Err(ParserError::ParserError(format!( - "Unexpected token in identifier: {token}" - )))? - } - None => { - return Err(ParserError::ParserError( - "Empty input when parsing identifier".to_string(), - ))? +/// Represents a path to a table that may require further resolution +/// that owns the underlying names +#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] +pub enum OwnedTableReference { + /// An unqualified table reference, e.g. "table" + Bare { + /// The table name + table: String, + }, + /// A partially resolved table reference, e.g. "schema.table" + Partial { + /// The schema containing the table + schema: String, + /// The table name + table: String, + }, + /// A fully resolved table reference, e.g. "catalog.schema.table" + Full { + /// The catalog (aka database) containing the table + catalog: String, + /// The schema containing the table + schema: String, + /// The table name + table: String, + }, +} + +impl std::fmt::Display for OwnedTableReference { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + OwnedTableReference::Bare { table } => write!(f, "{table}"), + OwnedTableReference::Partial { schema, table } => { + write!(f, "{schema}.{table}") + } + OwnedTableReference::Full { + catalog, + schema, + table, + } => write!(f, "{catalog}.{schema}.{table}"), } - }; - - while let Some(TokenWithLocation { token, .. }) = parser.next_token_no_skip() { - match token { - // ensure that optional period is succeeded by another identifier - Token::Period => match parser.next_token_no_skip() { - Some(TokenWithLocation { - token: Token::Word(w), - .. - }) => idents.push(w.to_ident()), - Some(TokenWithLocation { token, .. }) => { - return Err(ParserError::ParserError(format!( - "Unexpected token following period in identifier: {token}" - )))? - } - None => { - return Err(ParserError::ParserError( - "Trailing period in identifier".to_string(), - ))? - } + } +} + +impl OwnedTableReference { + /// Return a `TableReference` view of this `OwnedTableReference` + pub fn as_table_reference(&self) -> TableReference<'_> { + match self { + Self::Bare { table } => TableReference::Bare { + table: table.into(), + }, + Self::Partial { schema, table } => TableReference::Partial { + schema: schema.into(), + table: table.into(), + }, + Self::Full { + catalog, + schema, + table, + } => TableReference::Full { + catalog: catalog.into(), + schema: schema.into(), + table: table.into(), }, - _ => { - return Err(ParserError::ParserError(format!( - "Unexpected token in identifier: {token}" - )))? + } + } + + /// Retrieve the actual table name, regardless of qualification + pub fn table(&self) -> &str { + match self { + Self::Full { table, .. } + | Self::Partial { table, .. } + | Self::Bare { table } => table, + } + } + + /// Forms a string where the identifiers are quoted + pub fn to_quoted_string(&self) -> String { + match self { + OwnedTableReference::Bare { table } => quote_identifier(table), + OwnedTableReference::Partial { schema, table } => { + format!("{}.{}", quote_identifier(schema), quote_identifier(table)) } + OwnedTableReference::Full { + catalog, + schema, + table, + } => format!( + "{}.{}.{}", + quote_identifier(catalog), + quote_identifier(schema), + quote_identifier(table) + ), } } - Ok(idents) +} + +impl PartialEq<TableReference<'_>> for OwnedTableReference { + fn eq(&self, other: &TableReference<'_>) -> bool { + self.as_table_reference().eq(other) + } +} + +impl PartialEq<OwnedTableReference> for TableReference<'_> { + fn eq(&self, other: &OwnedTableReference) -> bool { + self.eq(&other.as_table_reference()) + } +} + +/// Parse a `&str` into a OwnedTableReference +impl From<&str> for OwnedTableReference { + fn from(s: &str) -> Self { + let table_reference: TableReference = s.into(); + table_reference.to_owned_reference() + } +} + +/// Parse a `String` into a OwnedTableReference +impl From<String> for OwnedTableReference { + fn from(s: String) -> Self { + Self::from(s.as_str()) + } +} + +/// Parse a `&String` into a OwnedTableReference +impl From<&String> for OwnedTableReference { + fn from(s: &String) -> Self { + Self::from(s.as_str()) + } +} + +/// Parse a `&String` into a OwnedTableReference +impl From<&OwnedTableReference> for OwnedTableReference { + fn from(s: &OwnedTableReference) -> Self { + s.clone() + } +} + +/// Parse a `TableReference` into a OwnedTableReference +impl From<&'_ TableReference<'_>> for OwnedTableReference { Review Comment: 👍 ########## datafusion/optimizer/src/push_down_projection.rs: ########## @@ -497,8 +494,10 @@ fn push_down_scan( let schema = scan.source.schema(); let mut projection: BTreeSet<usize> = required_columns .iter() + // TODO: change scan.table_name from String? Review Comment: maybe worth a follow on ticket? ########## docs/source/user-guide/example-usage.md: ########## @@ -118,8 +118,8 @@ async fn main() -> datafusion::error::Result<()> { let ctx = SessionContext::new(); let df = ctx.read_csv("tests/data/capitalized_example.csv", CsvReadOptions::new()).await?; - let df = df.filter(col("A").lt_eq(col("c")))? - .aggregate(vec![col("A")], vec![min(col("b"))])? + let df = df.filter(col("\"A\"").lt_eq(col("c")))? + .aggregate(vec![col("\"A\"")], vec![min(col("b"))])? .limit(0, Some(100))?; Review Comment: This is probably a pretty major change for people (that `col()` will now parse / normalize the column names according to SQL semantics). However, it does have the nice side benefit that the handling of `.` is no longer treated differently in `col()` What do you think about possibly 1. changing `col` so that it forms a unqualified reference always (aka does not respect `.`) 2. adding some new function like `ident()` or `parse_col()` that treats column like a sql identifier and correctly splits based on `.` etc? ########## datafusion/common/src/dfschema.rs: ########## @@ -677,7 +687,7 @@ mod tests { // lookup with unqualified name "t1.c0" let err = schema.index_of_column(&col).err().unwrap(); assert_eq!( - "Schema error: No field named 't1.c0'. Valid fields are 't1'.'c0', 't1'.'c1'.", + r#"Schema error: No field named "t1.c0". Valid fields are "t1"."c0", "t1"."c1"."#, Review Comment: I agree the new change is better. We can maybe go a step farther and only put `"` if the identifier would be normalized (e.g. it has something other than lowercase ascii letters) -- as a potential follow on PR ########## datafusion/common/src/dfschema.rs: ########## @@ -573,21 +580,21 @@ impl ExprSchema for DFSchema { #[derive(Debug, Clone, PartialEq, Eq)] pub struct DFField { /// Optional qualifier (usually a table or relation name) - qualifier: Option<String>, + qualifier: Option<OwnedTableReference>, /// Arrow field definition field: Field, } impl DFField { /// Creates a new `DFField` - pub fn new( - qualifier: Option<&str>, + pub fn new<R: Into<OwnedTableReference>>( + qualifier: Option<R>, Review Comment: > or maintain old method and create a new one?) Yeah, I agree it is strange to see `field_not_found::<&str>` and `DFField::new::<&str>` when there is no qualifier Since all sites with `None` as the qualifier are going to need to be changed anyways, what about making different functions for creating qualified and unqualified fields? Something like this: ```rust impl DFField { fn new<R: Into<OwnedTableReference>>( qualifier:R , name: &str, data_type: DataType, nullable: bool, ) -> Self {..} fn new_unqualified( name: &str, data_type: DataType, nullable: bool, ) -> Self {..} } ``` ? ########## datafusion/sql/src/expr/identifier.rs: ########## @@ -69,44 +105,100 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { })?; Ok(Expr::ScalarVariable(ty, var_names)) } else { - // only support "schema.table" type identifiers here - let (name, relation) = match idents_to_table_reference( - ids, - self.options.enable_ident_normalization, - )? { - OwnedTableReference::Partial { schema, table } => (table, schema), - r @ OwnedTableReference::Bare { .. } - | r @ OwnedTableReference::Full { .. } => { - return Err(DataFusionError::Plan(format!( - "Unsupported compound identifier '{r:?}'", - ))); - } - }; + let ids = ids + .into_iter() + .map(|id| { + if self.options.enable_ident_normalization { + normalize_ident(id) + } else { + id.value + } + }) + .collect::<Vec<_>>(); - // Try and find the reference in schema - match schema.field_with_qualified_name(&relation, &name) { - Ok(_) => { - // found an exact match on a qualified name so this is a table.column identifier - Ok(Expr::Column(Column { - relation: Some(relation), - name, - })) + // Possibilities we search with, in order from top to bottom for each len: Review Comment: Could you write some more unit tests for these cases (the comments are great, but it would be better if they were tests so they can't get out of sync as easily) ########## datafusion/common/src/column.rs: ########## @@ -27,15 +28,18 @@ use std::sync::Arc; /// A named reference to a qualified field in a schema. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct Column { - /// relation/table name. - pub relation: Option<String>, + /// relation/table reference. + pub relation: Option<OwnedTableReference>, /// field/column name. pub name: String, } impl Column { /// Create Column from optional qualifier and name - pub fn new(relation: Option<impl Into<String>>, name: impl Into<String>) -> Self { + pub fn new( + relation: Option<impl Into<OwnedTableReference>>, Review Comment: maybe we can update the docstring of this function to explain the new behavior (aka that relations will be parsed and normalized -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org