iffyio commented on code in PR #1576: URL: https://github.com/apache/datafusion-sqlparser-rs/pull/1576#discussion_r1873272299
########## src/tokenizer.rs: ########## @@ -1075,25 +1075,56 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::DoubleQuotedString(s))) } // delimited (quoted) identifier + quote_start if self.dialect.is_delimited_identifier_start(ch) => { + let word = self.tokenize_quoted_identifier(quote_start, chars)?; + Ok(Some(Token::make_word(&word, Some(quote_start)))) + } + // special (quoted) identifier quote_start - if self.dialect.is_delimited_identifier_start(ch) + if self + .dialect + .is_nested_delimited_identifier_start(quote_start) && self .dialect - .is_proper_identifier_inside_quotes(chars.peekable.clone()) => + .nested_delimited_identifier(chars.peekable.clone()) + .is_some() => { - let error_loc = chars.location(); - chars.next(); // consume the opening quote + let (quote_start, nested_quote_start) = self + .dialect + .nested_delimited_identifier(chars.peekable.clone()) + .unwrap(); Review Comment: ```suggestion ``` we can return an error here? ########## src/dialect/mod.rs: ########## @@ -128,14 +128,23 @@ pub trait Dialect: Debug + Any { ch == '"' || ch == '`' } - /// Return the character used to quote identifiers. - fn identifier_quote_style(&self, _identifier: &str) -> Option<char> { + /// Determine if a character starts a potential nested quoted identifier. + /// RedShift support old way of quotation with `[` and it can cover even nested quoted identifier. Review Comment: ```suggestion /// Example: RedShift supports the following quote styles to all mean the same thing: /// ```sql /// SELECT 1 AS foo; /// SELECT 1 AS "foo"; /// SELECT 1 AS [foo]; /// SELECT 1 AS ["foo"]; /// ``` ``` Thinking something like this to clarify the behavior? ########## tests/sqlparser_redshift.rs: ########## @@ -279,6 +279,31 @@ fn test_redshift_json_path() { }, expr_from_projection(only(&select.projection)) ); + + let sql = r#"SELECT db1.sc1.tbl1.col1[0]."id" FROM customer_orders_lineitem"#; + let select = dialects.verified_only_select(sql); + assert_eq!( + &Expr::JsonAccess { + value: Box::new(Expr::CompoundIdentifier(vec![ + Ident::new("db1"), + Ident::new("sc1"), + Ident::new("tbl1"), + Ident::new("col1") + ])), + path: JsonPath { + path: vec![ + JsonPathElem::Bracket { + key: Expr::Value(Value::Number("0".parse().unwrap(), false)) Review Comment: ```suggestion key: Expr::Value(number("0")) ``` ########## src/tokenizer.rs: ########## @@ -1075,25 +1075,56 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::DoubleQuotedString(s))) } // delimited (quoted) identifier + quote_start if self.dialect.is_delimited_identifier_start(ch) => { + let word = self.tokenize_quoted_identifier(quote_start, chars)?; + Ok(Some(Token::make_word(&word, Some(quote_start)))) + } + // special (quoted) identifier quote_start - if self.dialect.is_delimited_identifier_start(ch) + if self + .dialect + .is_nested_delimited_identifier_start(quote_start) && self .dialect - .is_proper_identifier_inside_quotes(chars.peekable.clone()) => + .nested_delimited_identifier(chars.peekable.clone()) + .is_some() => { - let error_loc = chars.location(); - chars.next(); // consume the opening quote + let (quote_start, nested_quote_start) = self + .dialect + .nested_delimited_identifier(chars.peekable.clone()) + .unwrap(); + + let Some(nested_quote_start) = nested_quote_start else { + let word = self.tokenize_quoted_identifier(quote_start, chars)?; + return Ok(Some(Token::make_word(&word, Some(quote_start)))); + }; + + let mut word = vec![]; let quote_end = Word::matching_end_quote(quote_start); - let (s, last_char) = self.parse_quoted_ident(chars, quote_end); + let nested_quote_end = Word::matching_end_quote(nested_quote_start); + let error_loc = chars.location(); - if last_char == Some(quote_end) { - Ok(Some(Token::make_word(&s, Some(quote_start)))) - } else { - self.tokenizer_error( + chars.next(); // skip the first delimiter + word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); + if chars.peek() != Some(&nested_quote_start) { + return self.tokenizer_error( + error_loc, + format!("Expected nested delimiter '{nested_quote_start}' before EOF."), + ); + } + word.push(format!("{nested_quote_start}")); Review Comment: ```suggestion word.push(nested_quote_start.into())); ``` ########## src/dialect/mod.rs: ########## @@ -128,14 +128,23 @@ pub trait Dialect: Debug + Any { ch == '"' || ch == '`' } - /// Return the character used to quote identifiers. - fn identifier_quote_style(&self, _identifier: &str) -> Option<char> { + /// Determine if a character starts a potential nested quoted identifier. + /// RedShift support old way of quotation with `[` and it can cover even nested quoted identifier. + fn is_nested_delimited_identifier_start(&self, _ch: char) -> bool { + false + } + + /// Determine if nested quoted characters are presented + fn nested_delimited_identifier( Review Comment: ```suggestion fn peek_nested_delimited_identifier_quotes( ``` ########## src/tokenizer.rs: ########## @@ -1075,25 +1075,56 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::DoubleQuotedString(s))) } // delimited (quoted) identifier + quote_start if self.dialect.is_delimited_identifier_start(ch) => { + let word = self.tokenize_quoted_identifier(quote_start, chars)?; + Ok(Some(Token::make_word(&word, Some(quote_start)))) + } + // special (quoted) identifier Review Comment: ```suggestion // Potentially nested delimited (quoted) identifier ``` ########## src/dialect/redshift.rs: ########## @@ -32,21 +32,36 @@ pub struct RedshiftSqlDialect {} // in the Postgres dialect, the query will be parsed as an array, while in the Redshift dialect it will // be a json path impl Dialect for RedshiftSqlDialect { - fn is_delimited_identifier_start(&self, ch: char) -> bool { - ch == '"' || ch == '[' + fn is_nested_delimited_identifier_start(&self, ch: char) -> bool { + ch == '[' } - /// Determine if quoted characters are proper for identifier + /// Determine if quoted characters are looks like special case of quotation begining with `[`. Review Comment: Maybe we can copy over the suggested dialect method comment here since its already describing redshift. ########## tests/sqlparser_redshift.rs: ########## @@ -353,3 +378,20 @@ fn test_parse_json_path_from() { _ => panic!(), } } + +#[test] +fn test_parse_select_numbered_columns() { Review Comment: thinking a couple cases we can include? ```sql SELECT 1 AS ["1]; SELECT 1 AS [ " 1 " ]; ``` ########## src/dialect/mod.rs: ########## @@ -128,14 +128,23 @@ pub trait Dialect: Debug + Any { ch == '"' || ch == '`' } - /// Return the character used to quote identifiers. - fn identifier_quote_style(&self, _identifier: &str) -> Option<char> { + /// Determine if a character starts a potential nested quoted identifier. + /// RedShift support old way of quotation with `[` and it can cover even nested quoted identifier. + fn is_nested_delimited_identifier_start(&self, _ch: char) -> bool { + false + } + + /// Determine if nested quoted characters are presented Review Comment: ```suggestion /// Only applicable whenever [`Self::is_nested_delimited_identifier_start`] returns true /// If the next sequence of tokens potentially represent a nested identifier, then this method /// returns a tuple containing the outer quote style, and if present, the inner (nested) quote style. /// /// Example (Redshift): /// ```text /// `["foo"]` => (Some(`[`), Some(`"`)) /// `[foo]` => (Some(`[`), None) /// `"foo"` => None /// ``` ``` Similarly, to clarify the behavior with an example since it wouldn't be intuitive otherwise ########## tests/sqlparser_redshift.rs: ########## @@ -353,3 +378,20 @@ fn test_parse_json_path_from() { _ => panic!(), } } + +#[test] +fn test_parse_select_numbered_columns() { + redshift_and_generic().verified_stmt(r#"SELECT 1 AS "1" FROM a"#); + // RedShift specific case - quoted identifier inside square bracket Review Comment: ```suggestion ``` this one seems to be already covered by `test_parse_create_numbered_columns`? ########## src/tokenizer.rs: ########## @@ -1075,25 +1075,56 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::DoubleQuotedString(s))) } // delimited (quoted) identifier + quote_start if self.dialect.is_delimited_identifier_start(ch) => { + let word = self.tokenize_quoted_identifier(quote_start, chars)?; + Ok(Some(Token::make_word(&word, Some(quote_start)))) + } + // special (quoted) identifier quote_start - if self.dialect.is_delimited_identifier_start(ch) + if self + .dialect + .is_nested_delimited_identifier_start(quote_start) && self .dialect - .is_proper_identifier_inside_quotes(chars.peekable.clone()) => + .nested_delimited_identifier(chars.peekable.clone()) + .is_some() => { - let error_loc = chars.location(); - chars.next(); // consume the opening quote + let (quote_start, nested_quote_start) = self + .dialect + .nested_delimited_identifier(chars.peekable.clone()) + .unwrap(); + + let Some(nested_quote_start) = nested_quote_start else { + let word = self.tokenize_quoted_identifier(quote_start, chars)?; + return Ok(Some(Token::make_word(&word, Some(quote_start)))); + }; + + let mut word = vec![]; let quote_end = Word::matching_end_quote(quote_start); - let (s, last_char) = self.parse_quoted_ident(chars, quote_end); + let nested_quote_end = Word::matching_end_quote(nested_quote_start); + let error_loc = chars.location(); - if last_char == Some(quote_end) { - Ok(Some(Token::make_word(&s, Some(quote_start)))) - } else { - self.tokenizer_error( + chars.next(); // skip the first delimiter + word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); + if chars.peek() != Some(&nested_quote_start) { + return self.tokenizer_error( + error_loc, + format!("Expected nested delimiter '{nested_quote_start}' before EOF."), + ); + } + word.push(format!("{nested_quote_start}")); + word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?); + word.push(format!("{nested_quote_end}")); Review Comment: ```suggestion word.push(nested_quote_end.into()); ``` ########## tests/sqlparser_redshift.rs: ########## @@ -353,3 +378,20 @@ fn test_parse_json_path_from() { _ => panic!(), } } + +#[test] +fn test_parse_select_numbered_columns() { + redshift_and_generic().verified_stmt(r#"SELECT 1 AS "1" FROM a"#); + // RedShift specific case - quoted identifier inside square bracket + redshift().verified_stmt(r#"SELECT 1 AS ["1"] FROM a"#); + redshift().verified_stmt(r#"SELECT 1 AS ["[="] FROM a"#); + redshift().verified_stmt(r#"SELECT 1 AS ["=]"] FROM a"#); + redshift().verified_stmt(r#"SELECT 1 AS ["a[b]"] FROM a"#); +} + +#[test] +fn test_parse_create_numbered_columns() { + redshift_and_generic().verified_stmt( + r#"CREATE TABLE test_table_1 ("1" INT, "d" VARCHAR(155), "2" DOUBLE PRECISION)"#, + ); +} Review Comment: Can we incorporate this as a test case under the `parse_delimited_identifiers` test? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org