This is an automated email from the ASF dual-hosted git repository.
iffyio pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 7867ba3c Redshift: Fix parsing for quoted numbered columns (#1576)
7867ba3c is described below
commit 7867ba3cf04c9c8324bfa26403945f0d53c2119a
Author: Aleksei Piianin <[email protected]>
AuthorDate: Sun Dec 15 10:56:11 2024 +0100
Redshift: Fix parsing for quoted numbered columns (#1576)
---
src/dialect/mod.rs | 53 ++++++++++++++++++++++++-------
src/dialect/redshift.rs | 48 ++++++++++++++++++++++------
src/tokenizer.rs | 77 +++++++++++++++++++++++++++++++++++++++------
tests/sqlparser_redshift.rs | 58 +++++++++++++++++++++++++++++++---
4 files changed, 200 insertions(+), 36 deletions(-)
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index 8cce6a35..c32b763a 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -128,14 +128,39 @@ pub trait Dialect: Debug + Any {
ch == '"' || ch == '`'
}
- /// Return the character used to quote identifiers.
- fn identifier_quote_style(&self, _identifier: &str) -> Option<char> {
+ /// Determine if a character starts a potential nested quoted identifier.
+ /// Example: RedShift supports the following quote styles to all mean the
same thing:
+ /// ```sql
+ /// SELECT 1 AS foo;
+ /// SELECT 1 AS "foo";
+ /// SELECT 1 AS [foo];
+ /// SELECT 1 AS ["foo"];
+ /// ```
+ fn is_nested_delimited_identifier_start(&self, _ch: char) -> bool {
+ false
+ }
+
+ /// Only applicable whenever
[`Self::is_nested_delimited_identifier_start`] returns true
+ /// If the next sequence of tokens potentially represent a nested
identifier, then this method
+ /// returns a tuple containing the outer quote style, and if present, the
inner (nested) quote style.
+ ///
+ /// Example (Redshift):
+ /// ```text
+ /// `["foo"]` => Some(`[`, Some(`"`))
+ /// `[foo]` => Some(`[`, None)
+ /// `[0]` => None
+ /// `"foo"` => None
+ /// ```
+ fn peek_nested_delimited_identifier_quotes(
+ &self,
+ mut _chars: Peekable<Chars<'_>>,
+ ) -> Option<(char, Option<char>)> {
None
}
- /// Determine if quoted characters are proper for identifier
- fn is_proper_identifier_inside_quotes(&self, mut _chars:
Peekable<Chars<'_>>) -> bool {
- true
+ /// Return the character used to quote identifiers.
+ fn identifier_quote_style(&self, _identifier: &str) -> Option<char> {
+ None
}
/// Determine if a character is a valid start character for an unquoted
identifier
@@ -869,6 +894,17 @@ mod tests {
self.0.is_delimited_identifier_start(ch)
}
+ fn is_nested_delimited_identifier_start(&self, ch: char) -> bool {
+ self.0.is_nested_delimited_identifier_start(ch)
+ }
+
+ fn peek_nested_delimited_identifier_quotes(
+ &self,
+ chars: std::iter::Peekable<std::str::Chars<'_>>,
+ ) -> Option<(char, Option<char>)> {
+ self.0.peek_nested_delimited_identifier_quotes(chars)
+ }
+
fn identifier_quote_style(&self, identifier: &str) -> Option<char>
{
self.0.identifier_quote_style(identifier)
}
@@ -877,13 +913,6 @@ mod tests {
self.0.supports_string_literal_backslash_escape()
}
- fn is_proper_identifier_inside_quotes(
- &self,
- chars: std::iter::Peekable<std::str::Chars<'_>>,
- ) -> bool {
- self.0.is_proper_identifier_inside_quotes(chars)
- }
-
fn supports_filter_during_aggregation(&self) -> bool {
self.0.supports_filter_during_aggregation()
}
diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs
index 48eb00ab..55405ba5 100644
--- a/src/dialect/redshift.rs
+++ b/src/dialect/redshift.rs
@@ -32,21 +32,51 @@ pub struct RedshiftSqlDialect {}
// in the Postgres dialect, the query will be parsed as an array, while in the
Redshift dialect it will
// be a json path
impl Dialect for RedshiftSqlDialect {
- fn is_delimited_identifier_start(&self, ch: char) -> bool {
- ch == '"' || ch == '['
+ /// Determine if a character starts a potential nested quoted identifier.
+ /// Example: RedShift supports the following quote styles to all mean the
same thing:
+ /// ```sql
+ /// SELECT 1 AS foo;
+ /// SELECT 1 AS "foo";
+ /// SELECT 1 AS [foo];
+ /// SELECT 1 AS ["foo"];
+ /// ```
+ fn is_nested_delimited_identifier_start(&self, ch: char) -> bool {
+ ch == '['
}
- /// Determine if quoted characters are proper for identifier
- /// It's needed to distinguish treating square brackets as quotes from
- /// treating them as json path. If there is identifier then we assume
- /// there is no json path.
- fn is_proper_identifier_inside_quotes(&self, mut chars:
Peekable<Chars<'_>>) -> bool {
+ /// Only applicable whenever
[`Self::is_nested_delimited_identifier_start`] returns true
+ /// If the next sequence of tokens potentially represent a nested
identifier, then this method
+ /// returns a tuple containing the outer quote style, and if present, the
inner (nested) quote style.
+ ///
+ /// Example (Redshift):
+ /// ```text
+ /// `["foo"]` => Some(`[`, Some(`"`))
+ /// `[foo]` => Some(`[`, None)
+ /// `[0]` => None
+ /// `"foo"` => None
+ /// ```
+ fn peek_nested_delimited_identifier_quotes(
+ &self,
+ mut chars: Peekable<Chars<'_>>,
+ ) -> Option<(char, Option<char>)> {
+ if chars.peek() != Some(&'[') {
+ return None;
+ }
+
chars.next();
+
let mut not_white_chars = chars.skip_while(|ch|
ch.is_whitespace()).peekable();
+
if let Some(&ch) = not_white_chars.peek() {
- return self.is_identifier_start(ch);
+ if ch == '"' {
+ return Some(('[', Some('"')));
+ }
+ if self.is_identifier_start(ch) {
+ return Some(('[', None));
+ }
}
- false
+
+ None
}
fn is_identifier_start(&self, ch: char) -> bool {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index aacfc16f..9269f4fe 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1075,25 +1075,61 @@ impl<'a> Tokenizer<'a> {
Ok(Some(Token::DoubleQuotedString(s)))
}
// delimited (quoted) identifier
+ quote_start if self.dialect.is_delimited_identifier_start(ch)
=> {
+ let word = self.tokenize_quoted_identifier(quote_start,
chars)?;
+ Ok(Some(Token::make_word(&word, Some(quote_start))))
+ }
+ // Potentially nested delimited (quoted) identifier
quote_start
- if self.dialect.is_delimited_identifier_start(ch)
+ if self
+ .dialect
+ .is_nested_delimited_identifier_start(quote_start)
&& self
.dialect
-
.is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
+
.peek_nested_delimited_identifier_quotes(chars.peekable.clone())
+ .is_some() =>
{
- let error_loc = chars.location();
- chars.next(); // consume the opening quote
+ let Some((quote_start, nested_quote_start)) = self
+ .dialect
+
.peek_nested_delimited_identifier_quotes(chars.peekable.clone())
+ else {
+ return self.tokenizer_error(
+ chars.location(),
+ format!("Expected nested delimiter '{quote_start}'
before EOF."),
+ );
+ };
+
+ let Some(nested_quote_start) = nested_quote_start else {
+ let word =
self.tokenize_quoted_identifier(quote_start, chars)?;
+ return Ok(Some(Token::make_word(&word,
Some(quote_start))));
+ };
+
+ let mut word = vec![];
let quote_end = Word::matching_end_quote(quote_start);
- let (s, last_char) = self.parse_quoted_ident(chars,
quote_end);
+ let nested_quote_end =
Word::matching_end_quote(nested_quote_start);
+ let error_loc = chars.location();
- if last_char == Some(quote_end) {
- Ok(Some(Token::make_word(&s, Some(quote_start))))
- } else {
- self.tokenizer_error(
+ chars.next(); // skip the first delimiter
+ peeking_take_while(chars, |ch| ch.is_whitespace());
+ if chars.peek() != Some(&nested_quote_start) {
+ return self.tokenizer_error(
+ error_loc,
+ format!("Expected nested delimiter
'{nested_quote_start}' before EOF."),
+ );
+ }
+ word.push(nested_quote_start.into());
+
word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
+ word.push(nested_quote_end.into());
+ peeking_take_while(chars, |ch| ch.is_whitespace());
+ if chars.peek() != Some("e_end) {
+ return self.tokenizer_error(
error_loc,
format!("Expected close delimiter '{quote_end}'
before EOF."),
- )
+ );
}
+ chars.next(); // skip close delimiter
+
+ Ok(Some(Token::make_word(&word.concat(),
Some(quote_start))))
}
// numbers and period
'0'..='9' | '.' => {
@@ -1597,6 +1633,27 @@ impl<'a> Tokenizer<'a> {
s
}
+ /// Read a quoted identifier
+ fn tokenize_quoted_identifier(
+ &self,
+ quote_start: char,
+ chars: &mut State,
+ ) -> Result<String, TokenizerError> {
+ let error_loc = chars.location();
+ chars.next(); // consume the opening quote
+ let quote_end = Word::matching_end_quote(quote_start);
+ let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
+
+ if last_char == Some(quote_end) {
+ Ok(s)
+ } else {
+ self.tokenizer_error(
+ error_loc,
+ format!("Expected close delimiter '{quote_end}' before EOF."),
+ )
+ }
+ }
+
/// Read a single quoted string, starting with the opening quote.
fn tokenize_escaped_single_quoted_string(
&self,
diff --git a/tests/sqlparser_redshift.rs b/tests/sqlparser_redshift.rs
index 9492946d..857d378b 100644
--- a/tests/sqlparser_redshift.rs
+++ b/tests/sqlparser_redshift.rs
@@ -157,6 +157,8 @@ fn parse_delimited_identifiers() {
}
redshift().verified_stmt(r#"CREATE TABLE "foo" ("bar" "int")"#);
+ // An alias starting with a number
+ redshift().verified_stmt(r#"CREATE TABLE "foo" ("1" INT)"#);
redshift().verified_stmt(r#"ALTER TABLE foo ADD CONSTRAINT "bar" PRIMARY
KEY (baz)"#);
//TODO verified_stmt(r#"UPDATE foo SET "bar" = 5"#);
}
@@ -203,7 +205,7 @@ fn test_redshift_json_path() {
path: JsonPath {
path: vec![
JsonPathElem::Bracket {
- key: Expr::Value(Value::Number("0".parse().unwrap(),
false))
+ key: Expr::Value(number("0"))
},
JsonPathElem::Dot {
key: "o_orderkey".to_string(),
@@ -226,7 +228,7 @@ fn test_redshift_json_path() {
path: JsonPath {
path: vec![
JsonPathElem::Bracket {
- key: Expr::Value(Value::Number("0".parse().unwrap(),
false))
+ key: Expr::Value(number("0"))
},
JsonPathElem::Bracket {
key:
Expr::Value(Value::SingleQuotedString("id".to_owned()))
@@ -250,7 +252,7 @@ fn test_redshift_json_path() {
path: JsonPath {
path: vec![
JsonPathElem::Bracket {
- key: Expr::Value(Value::Number("0".parse().unwrap(),
false))
+ key: Expr::Value(number("0"))
},
JsonPathElem::Bracket {
key:
Expr::Value(Value::SingleQuotedString("id".to_owned()))
@@ -260,6 +262,31 @@ fn test_redshift_json_path() {
},
expr_from_projection(only(&select.projection))
);
+
+ let sql = r#"SELECT db1.sc1.tbl1.col1[0]."id" FROM
customer_orders_lineitem"#;
+ let select = dialects.verified_only_select(sql);
+ assert_eq!(
+ &Expr::JsonAccess {
+ value: Box::new(Expr::CompoundIdentifier(vec![
+ Ident::new("db1"),
+ Ident::new("sc1"),
+ Ident::new("tbl1"),
+ Ident::new("col1")
+ ])),
+ path: JsonPath {
+ path: vec![
+ JsonPathElem::Bracket {
+ key: Expr::Value(number("0"))
+ },
+ JsonPathElem::Dot {
+ key: "id".to_string(),
+ quoted: true,
+ }
+ ]
+ }
+ },
+ expr_from_projection(only(&select.projection))
+ );
}
#[test]
@@ -276,7 +303,7 @@ fn test_parse_json_path_from() {
&Some(JsonPath {
path: vec![
JsonPathElem::Bracket {
- key:
Expr::Value(Value::Number("0".parse().unwrap(), false))
+ key: Expr::Value(number("0"))
},
JsonPathElem::Dot {
key: "a".to_string(),
@@ -300,7 +327,7 @@ fn test_parse_json_path_from() {
&Some(JsonPath {
path: vec![
JsonPathElem::Bracket {
- key:
Expr::Value(Value::Number("0".parse().unwrap(), false))
+ key: Expr::Value(number("0"))
},
JsonPathElem::Dot {
key: "a".to_string(),
@@ -334,3 +361,24 @@ fn test_parse_json_path_from() {
_ => panic!(),
}
}
+
+#[test]
+fn test_parse_select_numbered_columns() {
+ // An alias starting with a number
+ redshift_and_generic().verified_stmt(r#"SELECT 1 AS "1" FROM a"#);
+ redshift_and_generic().verified_stmt(r#"SELECT 1 AS "1abc" FROM a"#);
+}
+
+#[test]
+fn test_parse_nested_quoted_identifier() {
+ redshift().verified_stmt(r#"SELECT 1 AS ["1"] FROM a"#);
+ redshift().verified_stmt(r#"SELECT 1 AS ["[="] FROM a"#);
+ redshift().verified_stmt(r#"SELECT 1 AS ["=]"] FROM a"#);
+ redshift().verified_stmt(r#"SELECT 1 AS ["a[b]"] FROM a"#);
+ // trim spaces
+ redshift().one_statement_parses_to(r#"SELECT 1 AS [ " 1 " ]"#, r#"SELECT 1
AS [" 1 "]"#);
+ // invalid query
+ assert!(redshift()
+ .parse_sql_statements(r#"SELECT 1 AS ["1]"#)
+ .is_err());
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]