iffyio commented on code in PR #1513: URL: https://github.com/apache/datafusion-sqlparser-rs/pull/1513#discussion_r1842629566
########## src/parser/mod.rs: ########## @@ -1013,175 +1189,22 @@ impl<'a> Parser<'a> { let next_token = self.next_token(); let expr = match next_token.token { - Token::Word(w) => match w.keyword { - Keyword::TRUE | Keyword::FALSE if self.dialect.supports_boolean_literals() => { - self.prev_token(); - Ok(Expr::Value(self.parse_value()?)) - } - Keyword::NULL => { - self.prev_token(); - Ok(Expr::Value(self.parse_value()?)) - } - Keyword::CURRENT_CATALOG - | Keyword::CURRENT_USER - | Keyword::SESSION_USER - | Keyword::USER - if dialect_of!(self is PostgreSqlDialect | GenericDialect) => - { - Ok(Expr::Function(Function { - name: ObjectName(vec![w.to_ident()]), - parameters: FunctionArguments::None, - args: FunctionArguments::None, - null_treatment: None, - filter: None, - over: None, - within_group: vec![], - })) - } - Keyword::CURRENT_TIMESTAMP - | Keyword::CURRENT_TIME - | Keyword::CURRENT_DATE - | Keyword::LOCALTIME - | Keyword::LOCALTIMESTAMP => { - self.parse_time_functions(ObjectName(vec![w.to_ident()])) - } - Keyword::CASE => self.parse_case_expr(), - Keyword::CONVERT => self.parse_convert_expr(false), - Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => self.parse_convert_expr(true), - Keyword::CAST => self.parse_cast_expr(CastKind::Cast), - Keyword::TRY_CAST => self.parse_cast_expr(CastKind::TryCast), - Keyword::SAFE_CAST => self.parse_cast_expr(CastKind::SafeCast), - Keyword::EXISTS - // Support parsing Databricks has a function named `exists`. - if !dialect_of!(self is DatabricksDialect) - || matches!( - self.peek_nth_token(1).token, - Token::Word(Word { - keyword: Keyword::SELECT | Keyword::WITH, - .. 
- }) - ) => - { - self.parse_exists_expr(false) - } - Keyword::EXTRACT => self.parse_extract_expr(), - Keyword::CEIL => self.parse_ceil_floor_expr(true), - Keyword::FLOOR => self.parse_ceil_floor_expr(false), - Keyword::POSITION if self.peek_token().token == Token::LParen => { - self.parse_position_expr(w.to_ident()) - } - Keyword::SUBSTRING => self.parse_substring_expr(), - Keyword::OVERLAY => self.parse_overlay_expr(), - Keyword::TRIM => self.parse_trim_expr(), - Keyword::INTERVAL => self.parse_interval(), - // Treat ARRAY[1,2,3] as an array [1,2,3], otherwise try as subquery or a function call - Keyword::ARRAY if self.peek_token() == Token::LBracket => { - self.expect_token(&Token::LBracket)?; - self.parse_array_expr(true) - } - Keyword::ARRAY - if self.peek_token() == Token::LParen - && !dialect_of!(self is ClickHouseDialect | DatabricksDialect) => - { - self.expect_token(&Token::LParen)?; - let query = self.parse_query()?; - self.expect_token(&Token::RParen)?; - Ok(Expr::Function(Function { - name: ObjectName(vec![w.to_ident()]), - parameters: FunctionArguments::None, - args: FunctionArguments::Subquery(query), - filter: None, - null_treatment: None, - over: None, - within_group: vec![], - })) - } - Keyword::NOT => self.parse_not(), - Keyword::MATCH if dialect_of!(self is MySqlDialect | GenericDialect) => { - self.parse_match_against() - } - Keyword::STRUCT if dialect_of!(self is BigQueryDialect | GenericDialect) => { - self.prev_token(); - self.parse_bigquery_struct_literal() - } - Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) => { - let expr = self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?; - Ok(Expr::Prior(Box::new(expr))) - } - Keyword::MAP if self.peek_token() == Token::LBrace && self.dialect.support_map_literal_syntax() => { - self.parse_duckdb_map_literal() - } - // Here `w` is a word, check if it's a part of a multipart - // identifier, a function call, or a simple identifier: - _ => match self.peek_token().token { 
- Token::LParen | Token::Period => { - let mut id_parts: Vec<Ident> = vec![w.to_ident()]; - let mut ends_with_wildcard = false; - while self.consume_token(&Token::Period) { - let next_token = self.next_token(); - match next_token.token { - Token::Word(w) => id_parts.push(w.to_ident()), - Token::Mul => { - // Postgres explicitly allows funcnm(tablenm.*) and the - // function array_agg traverses this control flow - if dialect_of!(self is PostgreSqlDialect) { - ends_with_wildcard = true; - break; - } else { - return self - .expected("an identifier after '.'", next_token); - } - } - Token::SingleQuotedString(s) => { - id_parts.push(Ident::with_quote('\'', s)) - } - _ => { - return self - .expected("an identifier or a '*' after '.'", next_token); - } - } + // We first try to parse the word as the prefix of an expression. + // For example, the word INTERVAL in: SELECT INTERVAL '7' DAY + Token::Word(w) => match self.try_parse(|parser| parser.parse_expr_by_keyword(&w)) { Review Comment: Is it possible to use `self.maybe_parse` instead for the optional expr parsing? ########## tests/sqlparser_common.rs: ########## @@ -5071,7 +5071,9 @@ fn parse_interval_dont_require_unit() { #[test] fn parse_interval_require_unit() { - let dialects = all_dialects_where(|d| d.require_interval_qualifier()); + let dialects = all_dialects_where(|d| { + d.require_interval_qualifier() && d.is_reserved_for_identifier(Keyword::INTERVAL) Review Comment: oh, I would have expected the reserved condition to always be true given that the require_interval condition is true? 
That is, would it be correct for a dialect to not have INTERVAL as a keyword yet have `d.require_interval_qualifier()` return true for that dialect? ########## src/parser/mod.rs: ########## @@ -3574,6 +3597,24 @@ impl<'a> Parser<'a> { } } + /// Run a parser method `f`, reverting back to the current position if unsuccessful + /// but retaining the error message if such was raised by `f` + pub fn try_parse<T, F>(&mut self, mut f: F) -> Result<T, ParserError> Review Comment: Ah, I left a similar comment earlier; I suspect I'm missing something, but I don't see the need for a custom try_parse vs maybe_parse. ########## src/ast/mod.rs: ########## @@ -695,6 +695,8 @@ pub enum Expr { // https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#formatting_syntax format: Option<CastFormat>, }, + /// `DEFAULT` value of a column e.g. INSERT INTO tbl (a, b) VALUES ('foo', DEFAULT) + Default, Review Comment: is DEFAULT a reserved keyword in all dialects/the SQL standard? Wondering whether, if it's not, this might break behavior for some folks ########## src/parser/mod.rs: ########## @@ -965,6 +969,178 @@ impl<'a> Parser<'a> { Ok(Statement::NOTIFY { channel, payload }) } + fn parse_expr_by_keyword(&mut self, w: &Word) -> Result<Expr, ParserError> { + match w.keyword { + Keyword::TRUE | Keyword::FALSE if self.dialect.supports_boolean_literals() => { + self.prev_token(); + Ok(Expr::Value(self.parse_value()?)) + } + Keyword::NULL => { + self.prev_token(); + Ok(Expr::Value(self.parse_value()?)) + } + Keyword::CURRENT_CATALOG + | Keyword::CURRENT_USER + | Keyword::SESSION_USER + | Keyword::USER + if dialect_of!(self is PostgreSqlDialect | GenericDialect) => + { + Ok(Expr::Function(Function { + name: ObjectName(vec![w.to_ident()]), + parameters: FunctionArguments::None, + args: FunctionArguments::None, + null_treatment: None, + filter: None, + over: None, + within_group: vec![], + })) + } + Keyword::CURRENT_TIMESTAMP + | Keyword::CURRENT_TIME + | Keyword::CURRENT_DATE + | 
Keyword::LOCALTIME + | Keyword::LOCALTIMESTAMP => { + self.parse_time_functions(ObjectName(vec![w.to_ident()])) + } + Keyword::CASE => self.parse_case_expr(), + Keyword::CONVERT => self.parse_convert_expr(false), + Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => self.parse_convert_expr(true), + Keyword::CAST => self.parse_cast_expr(CastKind::Cast), + Keyword::TRY_CAST => self.parse_cast_expr(CastKind::TryCast), + Keyword::SAFE_CAST => self.parse_cast_expr(CastKind::SafeCast), + Keyword::EXISTS + // Support parsing Databricks has a function named `exists`. + if !dialect_of!(self is DatabricksDialect) + || matches!( + self.peek_nth_token(1).token, + Token::Word(Word { + keyword: Keyword::SELECT | Keyword::WITH, + .. + }) + ) => + { + self.parse_exists_expr(false) + } + Keyword::EXTRACT => self.parse_extract_expr(), + Keyword::CEIL => self.parse_ceil_floor_expr(true), + Keyword::FLOOR => self.parse_ceil_floor_expr(false), + Keyword::POSITION if self.peek_token().token == Token::LParen => { + self.parse_position_expr(w.to_ident()) + } + Keyword::SUBSTRING => self.parse_substring_expr(), + Keyword::OVERLAY => self.parse_overlay_expr(), + Keyword::TRIM => self.parse_trim_expr(), + Keyword::INTERVAL => self.parse_interval(), + // Treat ARRAY[1,2,3] as an array [1,2,3], otherwise try as subquery or a function call + Keyword::ARRAY if self.peek_token() == Token::LBracket => { + self.expect_token(&Token::LBracket)?; + self.parse_array_expr(true) + } + Keyword::ARRAY + if self.peek_token() == Token::LParen + && !dialect_of!(self is ClickHouseDialect | DatabricksDialect) => + { + self.expect_token(&Token::LParen)?; + let query = self.parse_query()?; + self.expect_token(&Token::RParen)?; + Ok(Expr::Function(Function { + name: ObjectName(vec![w.to_ident()]), + parameters: FunctionArguments::None, + args: FunctionArguments::Subquery(query), + filter: None, + null_treatment: None, + over: None, + within_group: vec![], + })) + } + Keyword::NOT => self.parse_not(), + 
Keyword::MATCH if dialect_of!(self is MySqlDialect | GenericDialect) => { + self.parse_match_against() + } + Keyword::STRUCT if dialect_of!(self is BigQueryDialect | GenericDialect) => { + self.prev_token(); + self.parse_bigquery_struct_literal() + } + Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) => { + let expr = self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?; + Ok(Expr::Prior(Box::new(expr))) + } + Keyword::MAP if self.peek_token() == Token::LBrace && self.dialect.support_map_literal_syntax() => { + self.parse_duckdb_map_literal() + } + Keyword::DEFAULT => Ok(Expr::Default), + _ => Err(ParserError::BranchAbandoned) + } + } + + fn parse_ident_expr(&mut self, w: &Word) -> Result<Expr, ParserError> { Review Comment: hmm thinking we would need to rename the function, it seems to be doing more than identifiers (and based on the name its unclear what the word `w` input argument is for) ########## src/parser/mod.rs: ########## @@ -1013,175 +1189,22 @@ impl<'a> Parser<'a> { let next_token = self.next_token(); let expr = match next_token.token { - Token::Word(w) => match w.keyword { - Keyword::TRUE | Keyword::FALSE if self.dialect.supports_boolean_literals() => { - self.prev_token(); - Ok(Expr::Value(self.parse_value()?)) - } - Keyword::NULL => { - self.prev_token(); - Ok(Expr::Value(self.parse_value()?)) - } - Keyword::CURRENT_CATALOG - | Keyword::CURRENT_USER - | Keyword::SESSION_USER - | Keyword::USER - if dialect_of!(self is PostgreSqlDialect | GenericDialect) => - { - Ok(Expr::Function(Function { - name: ObjectName(vec![w.to_ident()]), - parameters: FunctionArguments::None, - args: FunctionArguments::None, - null_treatment: None, - filter: None, - over: None, - within_group: vec![], - })) - } - Keyword::CURRENT_TIMESTAMP - | Keyword::CURRENT_TIME - | Keyword::CURRENT_DATE - | Keyword::LOCALTIME - | Keyword::LOCALTIMESTAMP => { - self.parse_time_functions(ObjectName(vec![w.to_ident()])) - } - Keyword::CASE => 
self.parse_case_expr(), - Keyword::CONVERT => self.parse_convert_expr(false), - Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => self.parse_convert_expr(true), - Keyword::CAST => self.parse_cast_expr(CastKind::Cast), - Keyword::TRY_CAST => self.parse_cast_expr(CastKind::TryCast), - Keyword::SAFE_CAST => self.parse_cast_expr(CastKind::SafeCast), - Keyword::EXISTS - // Support parsing Databricks has a function named `exists`. - if !dialect_of!(self is DatabricksDialect) - || matches!( - self.peek_nth_token(1).token, - Token::Word(Word { - keyword: Keyword::SELECT | Keyword::WITH, - .. - }) - ) => - { - self.parse_exists_expr(false) - } - Keyword::EXTRACT => self.parse_extract_expr(), - Keyword::CEIL => self.parse_ceil_floor_expr(true), - Keyword::FLOOR => self.parse_ceil_floor_expr(false), - Keyword::POSITION if self.peek_token().token == Token::LParen => { - self.parse_position_expr(w.to_ident()) - } - Keyword::SUBSTRING => self.parse_substring_expr(), - Keyword::OVERLAY => self.parse_overlay_expr(), - Keyword::TRIM => self.parse_trim_expr(), - Keyword::INTERVAL => self.parse_interval(), - // Treat ARRAY[1,2,3] as an array [1,2,3], otherwise try as subquery or a function call - Keyword::ARRAY if self.peek_token() == Token::LBracket => { - self.expect_token(&Token::LBracket)?; - self.parse_array_expr(true) - } - Keyword::ARRAY - if self.peek_token() == Token::LParen - && !dialect_of!(self is ClickHouseDialect | DatabricksDialect) => - { - self.expect_token(&Token::LParen)?; - let query = self.parse_query()?; - self.expect_token(&Token::RParen)?; - Ok(Expr::Function(Function { - name: ObjectName(vec![w.to_ident()]), - parameters: FunctionArguments::None, - args: FunctionArguments::Subquery(query), - filter: None, - null_treatment: None, - over: None, - within_group: vec![], - })) - } - Keyword::NOT => self.parse_not(), - Keyword::MATCH if dialect_of!(self is MySqlDialect | GenericDialect) => { - self.parse_match_against() - } - Keyword::STRUCT if 
dialect_of!(self is BigQueryDialect | GenericDialect) => { - self.prev_token(); - self.parse_bigquery_struct_literal() - } - Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) => { - let expr = self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?; - Ok(Expr::Prior(Box::new(expr))) - } - Keyword::MAP if self.peek_token() == Token::LBrace && self.dialect.support_map_literal_syntax() => { - self.parse_duckdb_map_literal() - } - // Here `w` is a word, check if it's a part of a multipart - // identifier, a function call, or a simple identifier: - _ => match self.peek_token().token { - Token::LParen | Token::Period => { - let mut id_parts: Vec<Ident> = vec![w.to_ident()]; - let mut ends_with_wildcard = false; - while self.consume_token(&Token::Period) { - let next_token = self.next_token(); - match next_token.token { - Token::Word(w) => id_parts.push(w.to_ident()), - Token::Mul => { - // Postgres explicitly allows funcnm(tablenm.*) and the - // function array_agg traverses this control flow - if dialect_of!(self is PostgreSqlDialect) { - ends_with_wildcard = true; - break; - } else { - return self - .expected("an identifier after '.'", next_token); - } - } - Token::SingleQuotedString(s) => { - id_parts.push(Ident::with_quote('\'', s)) - } - _ => { - return self - .expected("an identifier or a '*' after '.'", next_token); - } - } + // We first try to parse the word as the prefix of an expression. 
+ // For example, the word INTERVAL in: SELECT INTERVAL '7' DAY + Token::Word(w) => match self.try_parse(|parser| parser.parse_expr_by_keyword(&w)) { + Ok(expr) => Ok(expr), + // Word does not indicate the start of a complex expression, try to parse as identifier + Err(ParserError::BranchAbandoned) => Ok(self.parse_ident_expr(&w)?), + // Word indicates the start of a complex expression, try to parse as identifier if the + // dialect does not reserve it, otherwise return the original error + Err(e) => { + if !self.dialect.is_reserved_for_identifier(w.keyword) { Review Comment: one thing I'm wondering regarding the original problem is whether the underlying issue is rather the keywords like interval not being properly gated to only the dialects that support them? I suspect there are cases that could benefit from not having the parser mistake keywords on dialects that don't support them (like in this `self.dialect.is_reserved_for_identifier`) - but I'm not sure whether there are cases in this scenario with exprs here that strictly require it -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org