Re: [PR] Fallback to identifier parsing if expression parsing fails [datafusion-sqlparser-rs]

via GitHub Wed, 20 Nov 2024 08:45:16 -0800


iffyio commented on code in PR #1513:
URL: 
https://github.com/apache/datafusion-sqlparser-rs/pull/1513#discussion_r1850627576



##########
src/parser/mod.rs:
##########
@@ -1009,6 +1009,183 @@ impl<'a> Parser<'a> {
         Ok(Statement::NOTIFY { channel, payload })
     }
 
+    // Tries to parse an expression by matching the specified word to known 
keywords that have a special meaning in the dialect.
+    // Returns `None` if no match is found.
+    fn parse_expr_prefix_by_reserved_word(
+        &mut self,
+        w: &Word,
+    ) -> Result<Option<Expr>, ParserError> {
+        match w.keyword {
+            Keyword::TRUE | Keyword::FALSE if 
self.dialect.supports_boolean_literals() => {
+                self.prev_token();
+                Ok(Some(Expr::Value(self.parse_value()?)))
+            }
+            Keyword::NULL => {
+                self.prev_token();
+                Ok(Some(Expr::Value(self.parse_value()?)))
+            }
+            Keyword::CURRENT_CATALOG
+            | Keyword::CURRENT_USER
+            | Keyword::SESSION_USER
+            | Keyword::USER
+                if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
+            {
+                Ok(Some(Expr::Function(Function {
+                    name: ObjectName(vec![w.to_ident()]),
+                    parameters: FunctionArguments::None,
+                    args: FunctionArguments::None,
+                    null_treatment: None,
+                    filter: None,
+                    over: None,
+                    within_group: vec![],
+                })))
+            }
+            Keyword::CURRENT_TIMESTAMP
+            | Keyword::CURRENT_TIME
+            | Keyword::CURRENT_DATE
+            | Keyword::LOCALTIME
+            | Keyword::LOCALTIMESTAMP => {
+                
Ok(Some(self.parse_time_functions(ObjectName(vec![w.to_ident()]))?))
+            }
+            Keyword::CASE => Ok(Some(self.parse_case_expr()?)),
+            Keyword::CONVERT => Ok(Some(self.parse_convert_expr(false)?)),
+            Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => 
Ok(Some(self.parse_convert_expr(true)?)),
+            Keyword::CAST => Ok(Some(self.parse_cast_expr(CastKind::Cast)?)),
+            Keyword::TRY_CAST => 
Ok(Some(self.parse_cast_expr(CastKind::TryCast)?)),
+            Keyword::SAFE_CAST => 
Ok(Some(self.parse_cast_expr(CastKind::SafeCast)?)),
+            Keyword::EXISTS
+                // Support parsing Databricks has a function named `exists`.
+                if !dialect_of!(self is DatabricksDialect)
+                    || matches!(
+                        self.peek_nth_token(1).token,
+                        Token::Word(Word {
+                            keyword: Keyword::SELECT | Keyword::WITH,
+                            ..
+                        })
+                    ) =>
+            {
+                Ok(Some(self.parse_exists_expr(false)?))
+            }
+            Keyword::EXTRACT => Ok(Some(self.parse_extract_expr()?)),
+            Keyword::CEIL => Ok(Some(self.parse_ceil_floor_expr(true)?)),
+            Keyword::FLOOR => Ok(Some(self.parse_ceil_floor_expr(false)?)),
+            Keyword::POSITION if self.peek_token().token == Token::LParen => {
+                Ok(Some(self.parse_position_expr(w.to_ident())?))
+            }
+            Keyword::SUBSTRING => Ok(Some(self.parse_substring_expr()?)),
+            Keyword::OVERLAY => Ok(Some(self.parse_overlay_expr()?)),
+            Keyword::TRIM => Ok(Some(self.parse_trim_expr()?)),
+            Keyword::INTERVAL => Ok(Some(self.parse_interval()?)),
+            // Treat ARRAY[1,2,3] as an array [1,2,3], otherwise try as 
subquery or a function call
+            Keyword::ARRAY if self.peek_token() == Token::LBracket => {
+                self.expect_token(&Token::LBracket)?;
+                Ok(Some(self.parse_array_expr(true)?))
+            }
+            Keyword::ARRAY
+                if self.peek_token() == Token::LParen
+                    && !dialect_of!(self is ClickHouseDialect | 
DatabricksDialect) =>
+            {
+                self.expect_token(&Token::LParen)?;
+                let query = self.parse_query()?;
+                self.expect_token(&Token::RParen)?;
+                Ok(Some(Expr::Function(Function {
+                    name: ObjectName(vec![w.to_ident()]),
+                    parameters: FunctionArguments::None,
+                    args: FunctionArguments::Subquery(query),
+                    filter: None,
+                    null_treatment: None,
+                    over: None,
+                    within_group: vec![],
+                })))
+            }
+            Keyword::NOT => Ok(Some(self.parse_not()?)),
+            Keyword::MATCH if dialect_of!(self is MySqlDialect | 
GenericDialect) => {
+                Ok(Some(self.parse_match_against()?))
+            }
+            Keyword::STRUCT if dialect_of!(self is BigQueryDialect | 
GenericDialect) => {
+                self.prev_token();
+                Ok(Some(self.parse_bigquery_struct_literal()?))
+            }
+            Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) => {
+                let expr = 
self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?;
+                Ok(Some(Expr::Prior(Box::new(expr))))
+            }
+            Keyword::MAP if self.peek_token() == Token::LBrace && 
self.dialect.support_map_literal_syntax() => {
+                Ok(Some(self.parse_duckdb_map_literal()?))
+            }
+            _ => Ok(None)
+        }
+    }
+
+    // Tries to parse an expression by a word that is not known to have a 
special meaning in the dialect.
+    fn parse_expr_prefix_by_unnreserved_word(&mut self, w: &Word) -> 
Result<Expr, ParserError> {

Review Comment:
   ```suggestion
       fn parse_expr_prefix_by_unreserved_word(&mut self, w: &Word) -> 
Result<Expr, ParserError> {
   ```



##########
src/parser/mod.rs:
##########
@@ -1057,176 +1234,258 @@ impl<'a> Parser<'a> {
 
         let next_token = self.next_token();
         let expr = match next_token.token {
-            Token::Word(w) => match w.keyword {
-                Keyword::TRUE | Keyword::FALSE if 
self.dialect.supports_boolean_literals() => {
-                    self.prev_token();
-                    Ok(Expr::Value(self.parse_value()?))
-                }
-                Keyword::NULL => {
-                    self.prev_token();
-                    Ok(Expr::Value(self.parse_value()?))
-                }
-                Keyword::CURRENT_CATALOG
-                | Keyword::CURRENT_USER
-                | Keyword::SESSION_USER
-                | Keyword::USER
-                    if dialect_of!(self is PostgreSqlDialect | GenericDialect) 
=>
-                {
-                    Ok(Expr::Function(Function {
-                        name: ObjectName(vec![w.to_ident()]),
-                        parameters: FunctionArguments::None,
-                        args: FunctionArguments::None,
-                        null_treatment: None,
-                        filter: None,
-                        over: None,
-                        within_group: vec![],
-                    }))
-                }
-                Keyword::CURRENT_TIMESTAMP
-                | Keyword::CURRENT_TIME
-                | Keyword::CURRENT_DATE
-                | Keyword::LOCALTIME
-                | Keyword::LOCALTIMESTAMP => {
-                    self.parse_time_functions(ObjectName(vec![w.to_ident()]))
-                }
-                Keyword::CASE => self.parse_case_expr(),
-                Keyword::CONVERT => self.parse_convert_expr(false),
-                Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => 
self.parse_convert_expr(true),
-                Keyword::CAST => self.parse_cast_expr(CastKind::Cast),
-                Keyword::TRY_CAST => self.parse_cast_expr(CastKind::TryCast),
-                Keyword::SAFE_CAST => self.parse_cast_expr(CastKind::SafeCast),
-                Keyword::EXISTS
-                    // Support parsing Databricks has a function named 
`exists`.
-                    if !dialect_of!(self is DatabricksDialect)
-                        || matches!(
-                            self.peek_nth_token(1).token,
-                            Token::Word(Word {
-                                keyword: Keyword::SELECT | Keyword::WITH,
-                                ..
-                            })
-                        ) =>
-                {
-                    self.parse_exists_expr(false)
-                }
-                Keyword::EXTRACT => self.parse_extract_expr(),
-                Keyword::CEIL => self.parse_ceil_floor_expr(true),
-                Keyword::FLOOR => self.parse_ceil_floor_expr(false),
-                Keyword::POSITION if self.peek_token().token == Token::LParen 
=> {
-                    self.parse_position_expr(w.to_ident())
-                }
-                Keyword::SUBSTRING => self.parse_substring_expr(),
-                Keyword::OVERLAY => self.parse_overlay_expr(),
-                Keyword::TRIM => self.parse_trim_expr(),
-                Keyword::INTERVAL => self.parse_interval(),
-                // Treat ARRAY[1,2,3] as an array [1,2,3], otherwise try as 
subquery or a function call
-                Keyword::ARRAY if self.peek_token() == Token::LBracket => {
-                    self.expect_token(&Token::LBracket)?;
-                    self.parse_array_expr(true)
-                }
-                Keyword::ARRAY
-                    if self.peek_token() == Token::LParen
-                        && !dialect_of!(self is ClickHouseDialect | 
DatabricksDialect) =>
+            Token::Word(w) => {
+                // The word we consumed may fall into one of two cases: it has 
a special meaning, or not.
+                // For example, in Snowflake, the word `interval` may have two 
meanings depending on the context:
+                // `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) 
FROM tbl;`
+                //                          ^^^^^^^^^^^^^^^^      ^^^^^^^^
+                //                         interval expression   identifier
+                //
+                // We first try to parse the word and following tokens as a 
special expression, and if that fails,
+                // we rollback and try to parse it as an identifier.
+                match self
+                    .maybe_parse_internal(|parser| 
parser.parse_expr_prefix_by_reserved_word(&w))
                 {
-                    self.expect_token(&Token::LParen)?;
-                    let query = self.parse_query()?;
-                    self.expect_token(&Token::RParen)?;
-                    Ok(Expr::Function(Function {
-                        name: ObjectName(vec![w.to_ident()]),
-                        parameters: FunctionArguments::None,
-                        args: FunctionArguments::Subquery(query),
-                        filter: None,
-                        null_treatment: None,
-                        over: None,
-                        within_group: vec![],
-                    }))
-                }
-                Keyword::NOT => self.parse_not(),
-                Keyword::MATCH if dialect_of!(self is MySqlDialect | 
GenericDialect) => {
-                    self.parse_match_against()
-                }
-                Keyword::STRUCT if dialect_of!(self is BigQueryDialect | 
GenericDialect) => {
-                    self.prev_token();
-                    self.parse_bigquery_struct_literal()
-                }
-                Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) 
=> {
-                    let expr = 
self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?;
-                    Ok(Expr::Prior(Box::new(expr)))
-                }
-                Keyword::MAP if self.peek_token() == Token::LBrace && 
self.dialect.support_map_literal_syntax() => {
-                    self.parse_duckdb_map_literal()
-                }
-                // Here `w` is a word, check if it's a part of a multipart
-                // identifier, a function call, or a simple identifier:
-                _ => match self.peek_token().token {
-                    Token::LParen | Token::Period => {
-                        let mut id_parts: Vec<Ident> = vec![w.to_ident()];
-                        let mut ends_with_wildcard = false;
-                        while self.consume_token(&Token::Period) {
-                            let next_token = self.next_token();
-                            match next_token.token {
-                                Token::Word(w) => id_parts.push(w.to_ident()),
-                                Token::Mul => {
-                                    // Postgres explicitly allows 
funcnm(tablenm.*) and the
-                                    // function array_agg traverses this 
control flow
-                                    if dialect_of!(self is PostgreSqlDialect) {
-                                        ends_with_wildcard = true;
-                                        break;
-                                    } else {
-                                        return self
-                                            .expected("an identifier after 
'.'", next_token);
-                                    }
-                                }
-                                Token::SingleQuotedString(s) => {
-                                    id_parts.push(Ident::with_quote('\'', s))
-                                }
-                                _ => {
-                                    return self
-                                        .expected("an identifier or a '*' 
after '.'", next_token);
-                                }
+                    // This word indicated an expression prefix and parsing 
was successful
+                    Ok(Some(expr)) => Ok(expr),
+
+                    // No expression prefix associated with this word
+                    Ok(None) => 
Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
+
+                    // If parsing of the word as a special expression failed, 
we are facing two options:
+                    // 1. The statement is malformed, e.g. `SELECT INTERVAL '1 
DAI`

Review Comment:
   ```suggestion
                       // 1. The statement is malformed, e.g. `SELECT INTERVAL 
'1 DAY`
   ```



##########
src/dialect/snowflake.rs:
##########
@@ -214,6 +216,16 @@ impl Dialect for SnowflakeDialect {
     fn supports_show_like_before_in(&self) -> bool {
         true
     }
+
+    fn is_reserved_for_identifier(&self, kw: Keyword) -> bool {

Review Comment:
   Are we missing tests for the new behavior? I can't seem to find any in the 
current PR.



##########
src/parser/mod.rs:
##########
@@ -3641,18 +3900,30 @@ impl<'a> Parser<'a> {
     }
 
     /// Run a parser method `f`, reverting back to the current position if 
unsuccessful.
-    pub fn maybe_parse<T, F>(&mut self, mut f: F) -> Result<Option<T>, 
ParserError>
+    /// Returns `None` if `f` returns an error
+    pub fn maybe_parse<T, F>(&mut self, f: F) -> Result<Option<T>, ParserError>
     where
         F: FnMut(&mut Parser) -> Result<T, ParserError>,
     {
-        let index = self.index;
-        match f(self) {
+        match self.maybe_parse_internal(f) {
             Ok(t) => Ok(Some(t)),
-            // Unwind stack if limit exceeded
             Err(ParserError::RecursionLimitExceeded) => 
Err(ParserError::RecursionLimitExceeded),
-            Err(_) => {
+            _ => Ok(None),
+        }
+    }
+
+    /// Run a parser method `f`, reverting back to the current position if 
unsuccessful.
+    pub fn maybe_parse_internal<T, F>(&mut self, mut f: F) -> Result<T, 
ParserError>

Review Comment:
   ```suggestion
       pub fn try_parse<T, F>(&mut self, mut f: F) -> Result<T, ParserError>
   ```
   We could probably call this `try_parse`, which hints that it returns an 
error (vs. `maybe_parse`, which hints at an `Option`).



##########
src/parser/mod.rs:
##########
@@ -1057,176 +1234,258 @@ impl<'a> Parser<'a> {
 
         let next_token = self.next_token();
         let expr = match next_token.token {
-            Token::Word(w) => match w.keyword {
-                Keyword::TRUE | Keyword::FALSE if 
self.dialect.supports_boolean_literals() => {
-                    self.prev_token();
-                    Ok(Expr::Value(self.parse_value()?))
-                }
-                Keyword::NULL => {
-                    self.prev_token();
-                    Ok(Expr::Value(self.parse_value()?))
-                }
-                Keyword::CURRENT_CATALOG
-                | Keyword::CURRENT_USER
-                | Keyword::SESSION_USER
-                | Keyword::USER
-                    if dialect_of!(self is PostgreSqlDialect | GenericDialect) 
=>
-                {
-                    Ok(Expr::Function(Function {
-                        name: ObjectName(vec![w.to_ident()]),
-                        parameters: FunctionArguments::None,
-                        args: FunctionArguments::None,
-                        null_treatment: None,
-                        filter: None,
-                        over: None,
-                        within_group: vec![],
-                    }))
-                }
-                Keyword::CURRENT_TIMESTAMP
-                | Keyword::CURRENT_TIME
-                | Keyword::CURRENT_DATE
-                | Keyword::LOCALTIME
-                | Keyword::LOCALTIMESTAMP => {
-                    self.parse_time_functions(ObjectName(vec![w.to_ident()]))
-                }
-                Keyword::CASE => self.parse_case_expr(),
-                Keyword::CONVERT => self.parse_convert_expr(false),
-                Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => 
self.parse_convert_expr(true),
-                Keyword::CAST => self.parse_cast_expr(CastKind::Cast),
-                Keyword::TRY_CAST => self.parse_cast_expr(CastKind::TryCast),
-                Keyword::SAFE_CAST => self.parse_cast_expr(CastKind::SafeCast),
-                Keyword::EXISTS
-                    // Support parsing Databricks has a function named 
`exists`.
-                    if !dialect_of!(self is DatabricksDialect)
-                        || matches!(
-                            self.peek_nth_token(1).token,
-                            Token::Word(Word {
-                                keyword: Keyword::SELECT | Keyword::WITH,
-                                ..
-                            })
-                        ) =>
-                {
-                    self.parse_exists_expr(false)
-                }
-                Keyword::EXTRACT => self.parse_extract_expr(),
-                Keyword::CEIL => self.parse_ceil_floor_expr(true),
-                Keyword::FLOOR => self.parse_ceil_floor_expr(false),
-                Keyword::POSITION if self.peek_token().token == Token::LParen 
=> {
-                    self.parse_position_expr(w.to_ident())
-                }
-                Keyword::SUBSTRING => self.parse_substring_expr(),
-                Keyword::OVERLAY => self.parse_overlay_expr(),
-                Keyword::TRIM => self.parse_trim_expr(),
-                Keyword::INTERVAL => self.parse_interval(),
-                // Treat ARRAY[1,2,3] as an array [1,2,3], otherwise try as 
subquery or a function call
-                Keyword::ARRAY if self.peek_token() == Token::LBracket => {
-                    self.expect_token(&Token::LBracket)?;
-                    self.parse_array_expr(true)
-                }
-                Keyword::ARRAY
-                    if self.peek_token() == Token::LParen
-                        && !dialect_of!(self is ClickHouseDialect | 
DatabricksDialect) =>
+            Token::Word(w) => {
+                // The word we consumed may fall into one of two cases: it has 
a special meaning, or not.
+                // For example, in Snowflake, the word `interval` may have two 
meanings depending on the context:
+                // `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) 
FROM tbl;`
+                //                          ^^^^^^^^^^^^^^^^      ^^^^^^^^
+                //                         interval expression   identifier
+                //
+                // We first try to parse the word and following tokens as a 
special expression, and if that fails,
+                // we rollback and try to parse it as an identifier.
+                match self
+                    .maybe_parse_internal(|parser| 
parser.parse_expr_prefix_by_reserved_word(&w))
                 {
-                    self.expect_token(&Token::LParen)?;
-                    let query = self.parse_query()?;
-                    self.expect_token(&Token::RParen)?;
-                    Ok(Expr::Function(Function {
-                        name: ObjectName(vec![w.to_ident()]),
-                        parameters: FunctionArguments::None,
-                        args: FunctionArguments::Subquery(query),
-                        filter: None,
-                        null_treatment: None,
-                        over: None,
-                        within_group: vec![],
-                    }))
-                }
-                Keyword::NOT => self.parse_not(),
-                Keyword::MATCH if dialect_of!(self is MySqlDialect | 
GenericDialect) => {
-                    self.parse_match_against()
-                }
-                Keyword::STRUCT if dialect_of!(self is BigQueryDialect | 
GenericDialect) => {
-                    self.prev_token();
-                    self.parse_bigquery_struct_literal()
-                }
-                Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) 
=> {
-                    let expr = 
self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?;
-                    Ok(Expr::Prior(Box::new(expr)))
-                }
-                Keyword::MAP if self.peek_token() == Token::LBrace && 
self.dialect.support_map_literal_syntax() => {
-                    self.parse_duckdb_map_literal()
-                }
-                // Here `w` is a word, check if it's a part of a multipart
-                // identifier, a function call, or a simple identifier:
-                _ => match self.peek_token().token {
-                    Token::LParen | Token::Period => {
-                        let mut id_parts: Vec<Ident> = vec![w.to_ident()];
-                        let mut ends_with_wildcard = false;
-                        while self.consume_token(&Token::Period) {
-                            let next_token = self.next_token();
-                            match next_token.token {
-                                Token::Word(w) => id_parts.push(w.to_ident()),
-                                Token::Mul => {
-                                    // Postgres explicitly allows 
funcnm(tablenm.*) and the
-                                    // function array_agg traverses this 
control flow
-                                    if dialect_of!(self is PostgreSqlDialect) {
-                                        ends_with_wildcard = true;
-                                        break;
-                                    } else {
-                                        return self
-                                            .expected("an identifier after 
'.'", next_token);
-                                    }
-                                }
-                                Token::SingleQuotedString(s) => {
-                                    id_parts.push(Ident::with_quote('\'', s))
-                                }
-                                _ => {
-                                    return self
-                                        .expected("an identifier or a '*' 
after '.'", next_token);
-                                }
+                    // This word indicated an expression prefix and parsing 
was successful
+                    Ok(Some(expr)) => Ok(expr),
+
+                    // No expression prefix associated with this word
+                    Ok(None) => 
Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
+
+                    // If parsing of the word as a special expression failed, 
we are facing two options:
+                    // 1. The statement is malformed, e.g. `SELECT INTERVAL '1 
DAI`
+                    // 2. The word is used as an identifier, e.g. `SELECT 
MAX(interval) FROM tbl`
+                    // We first try to parse the word as an identifier and if 
that fails
+                    // we rollback and return the parsing error we got from 
trying to parse a
+                    // special expression (to maintain backwards compatibility 
of parsing errors).
+                    Err(e) => {
+                        if !self.dialect.is_reserved_for_identifier(w.keyword) 
{
+                            if let Ok(expr) = 
self.maybe_parse_internal(|parser| {

Review Comment:
   ```suggestion
                               if let Some(expr) = self.maybe_parse(|parser| {
   ```
   It looks like we can use the normal `maybe_parse` here, since this call 
site doesn't have the special requirement of keeping the error (it discards 
the error anyway)?



##########
src/parser/mod.rs:
##########
@@ -1057,176 +1234,258 @@ impl<'a> Parser<'a> {
 
         let next_token = self.next_token();
         let expr = match next_token.token {
-            Token::Word(w) => match w.keyword {
-                Keyword::TRUE | Keyword::FALSE if 
self.dialect.supports_boolean_literals() => {
-                    self.prev_token();
-                    Ok(Expr::Value(self.parse_value()?))
-                }
-                Keyword::NULL => {
-                    self.prev_token();
-                    Ok(Expr::Value(self.parse_value()?))
-                }
-                Keyword::CURRENT_CATALOG
-                | Keyword::CURRENT_USER
-                | Keyword::SESSION_USER
-                | Keyword::USER
-                    if dialect_of!(self is PostgreSqlDialect | GenericDialect) 
=>
-                {
-                    Ok(Expr::Function(Function {
-                        name: ObjectName(vec![w.to_ident()]),
-                        parameters: FunctionArguments::None,
-                        args: FunctionArguments::None,
-                        null_treatment: None,
-                        filter: None,
-                        over: None,
-                        within_group: vec![],
-                    }))
-                }
-                Keyword::CURRENT_TIMESTAMP
-                | Keyword::CURRENT_TIME
-                | Keyword::CURRENT_DATE
-                | Keyword::LOCALTIME
-                | Keyword::LOCALTIMESTAMP => {
-                    self.parse_time_functions(ObjectName(vec![w.to_ident()]))
-                }
-                Keyword::CASE => self.parse_case_expr(),
-                Keyword::CONVERT => self.parse_convert_expr(false),
-                Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => 
self.parse_convert_expr(true),
-                Keyword::CAST => self.parse_cast_expr(CastKind::Cast),
-                Keyword::TRY_CAST => self.parse_cast_expr(CastKind::TryCast),
-                Keyword::SAFE_CAST => self.parse_cast_expr(CastKind::SafeCast),
-                Keyword::EXISTS
-                    // Support parsing Databricks has a function named 
`exists`.
-                    if !dialect_of!(self is DatabricksDialect)
-                        || matches!(
-                            self.peek_nth_token(1).token,
-                            Token::Word(Word {
-                                keyword: Keyword::SELECT | Keyword::WITH,
-                                ..
-                            })
-                        ) =>
-                {
-                    self.parse_exists_expr(false)
-                }
-                Keyword::EXTRACT => self.parse_extract_expr(),
-                Keyword::CEIL => self.parse_ceil_floor_expr(true),
-                Keyword::FLOOR => self.parse_ceil_floor_expr(false),
-                Keyword::POSITION if self.peek_token().token == Token::LParen 
=> {
-                    self.parse_position_expr(w.to_ident())
-                }
-                Keyword::SUBSTRING => self.parse_substring_expr(),
-                Keyword::OVERLAY => self.parse_overlay_expr(),
-                Keyword::TRIM => self.parse_trim_expr(),
-                Keyword::INTERVAL => self.parse_interval(),
-                // Treat ARRAY[1,2,3] as an array [1,2,3], otherwise try as 
subquery or a function call
-                Keyword::ARRAY if self.peek_token() == Token::LBracket => {
-                    self.expect_token(&Token::LBracket)?;
-                    self.parse_array_expr(true)
-                }
-                Keyword::ARRAY
-                    if self.peek_token() == Token::LParen
-                        && !dialect_of!(self is ClickHouseDialect | 
DatabricksDialect) =>
+            Token::Word(w) => {
+                // The word we consumed may fall into one of two cases: it has 
a special meaning, or not.
+                // For example, in Snowflake, the word `interval` may have two 
meanings depending on the context:
+                // `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) 
FROM tbl;`
+                //                          ^^^^^^^^^^^^^^^^      ^^^^^^^^
+                //                         interval expression   identifier
+                //
+                // We first try to parse the word and following tokens as a 
special expression, and if that fails,
+                // we rollback and try to parse it as an identifier.
+                match self
+                    .maybe_parse_internal(|parser| 
parser.parse_expr_prefix_by_reserved_word(&w))
                 {
-                    self.expect_token(&Token::LParen)?;
-                    let query = self.parse_query()?;
-                    self.expect_token(&Token::RParen)?;
-                    Ok(Expr::Function(Function {
-                        name: ObjectName(vec![w.to_ident()]),
-                        parameters: FunctionArguments::None,
-                        args: FunctionArguments::Subquery(query),
-                        filter: None,
-                        null_treatment: None,
-                        over: None,
-                        within_group: vec![],
-                    }))
-                }
-                Keyword::NOT => self.parse_not(),
-                Keyword::MATCH if dialect_of!(self is MySqlDialect | 
GenericDialect) => {
-                    self.parse_match_against()
-                }
-                Keyword::STRUCT if dialect_of!(self is BigQueryDialect | 
GenericDialect) => {
-                    self.prev_token();
-                    self.parse_bigquery_struct_literal()
-                }
-                Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) 
=> {
-                    let expr = 
self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?;
-                    Ok(Expr::Prior(Box::new(expr)))
-                }
-                Keyword::MAP if self.peek_token() == Token::LBrace && 
self.dialect.support_map_literal_syntax() => {
-                    self.parse_duckdb_map_literal()
-                }
-                // Here `w` is a word, check if it's a part of a multipart
-                // identifier, a function call, or a simple identifier:
-                _ => match self.peek_token().token {
-                    Token::LParen | Token::Period => {
-                        let mut id_parts: Vec<Ident> = vec![w.to_ident()];
-                        let mut ends_with_wildcard = false;
-                        while self.consume_token(&Token::Period) {
-                            let next_token = self.next_token();
-                            match next_token.token {
-                                Token::Word(w) => id_parts.push(w.to_ident()),
-                                Token::Mul => {
-                                    // Postgres explicitly allows 
funcnm(tablenm.*) and the
-                                    // function array_agg traverses this 
control flow
-                                    if dialect_of!(self is PostgreSqlDialect) {
-                                        ends_with_wildcard = true;
-                                        break;
-                                    } else {
-                                        return self
-                                            .expected("an identifier after 
'.'", next_token);
-                                    }
-                                }
-                                Token::SingleQuotedString(s) => {
-                                    id_parts.push(Ident::with_quote('\'', s))
-                                }
-                                _ => {
-                                    return self
-                                        .expected("an identifier or a '*' 
after '.'", next_token);
-                                }
+                    // This word indicated an expression prefix and parsing 
was successful
+                    Ok(Some(expr)) => Ok(expr),
+
+                    // No expression prefix associated with this word
+                    Ok(None) => 
Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
+
+                    // If parsing of the word as a special expression failed, 
we are facing two options:
+                    // 1. The statement is malformed, e.g. `SELECT INTERVAL '1 
DAI`
+                    // 2. The word is used as an identifier, e.g. `SELECT 
MAX(interval) FROM tbl`
+                    // We first try to parse the word as an identifier and if 
that fails
+                    // we rollback and return the parsing error we got from 
trying to parse a
+                    // special expression (to maintain backwards compatibility 
of parsing errors).
+                    Err(e) => {
+                        if !self.dialect.is_reserved_for_identifier(w.keyword) 
{
+                            if let Ok(expr) = 
self.maybe_parse_internal(|parser| {
+                                
parser.parse_expr_prefix_by_unnreserved_word(&w)
+                            }) {
+                                return Ok(expr);
                             }
                         }
+                        return Err(e);
+                    }
+                }
+            } // End of Token::Word
+            // array `[1, 2, 3]`
+            Token::LBracket => self.parse_array_expr(false),
+            tok @ Token::Minus | tok @ Token::Plus => {
+                let op = if tok == Token::Plus {
+                    UnaryOperator::Plus
+                } else {
+                    UnaryOperator::Minus
+                };
+                Ok(Expr::UnaryOp {
+                    op,
+                    expr: Box::new(
+                        
self.parse_subexpr(self.dialect.prec_value(Precedence::MulDivModOp))?,
+                    ),
+                })
+            }
+            Token::ExclamationMark if 
self.dialect.supports_bang_not_operator() => {
+                Ok(Expr::UnaryOp {
+                    op: UnaryOperator::BangNot,
+                    expr: Box::new(
+                        
self.parse_subexpr(self.dialect.prec_value(Precedence::UnaryNot))?,
+                    ),
+                })
+            }
+            tok @ Token::DoubleExclamationMark
+            | tok @ Token::PGSquareRoot
+            | tok @ Token::PGCubeRoot
+            | tok @ Token::AtSign
+            | tok @ Token::Tilde
+                if dialect_of!(self is PostgreSqlDialect) =>
+            {
+                let op = match tok {
+                    Token::DoubleExclamationMark => 
UnaryOperator::PGPrefixFactorial,
+                    Token::PGSquareRoot => UnaryOperator::PGSquareRoot,
+                    Token::PGCubeRoot => UnaryOperator::PGCubeRoot,
+                    Token::AtSign => UnaryOperator::PGAbs,
+                    Token::Tilde => UnaryOperator::PGBitwiseNot,
+                    _ => unreachable!(),
+                };
+                Ok(Expr::UnaryOp {
+                    op,
+                    expr: Box::new(
+                        
self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?,
+                    ),
+                })
+            }
+            Token::EscapedStringLiteral(_) if dialect_of!(self is 
PostgreSqlDialect | GenericDialect) =>
+            {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
+            Token::UnicodeStringLiteral(_) => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
+            Token::Number(_, _)
+            | Token::SingleQuotedString(_)
+            | Token::DoubleQuotedString(_)
+            | Token::TripleSingleQuotedString(_)
+            | Token::TripleDoubleQuotedString(_)
+            | Token::DollarQuotedString(_)
+            | Token::SingleQuotedByteStringLiteral(_)
+            | Token::DoubleQuotedByteStringLiteral(_)
+            | Token::TripleSingleQuotedByteStringLiteral(_)
+            | Token::TripleDoubleQuotedByteStringLiteral(_)
+            | Token::SingleQuotedRawStringLiteral(_)
+            | Token::DoubleQuotedRawStringLiteral(_)
+            | Token::TripleSingleQuotedRawStringLiteral(_)
+            | Token::TripleDoubleQuotedRawStringLiteral(_)
+            | Token::NationalStringLiteral(_)
+            | Token::HexStringLiteral(_) => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
+            Token::LParen => {
+                let expr = if let Some(expr) = 
self.try_parse_expr_sub_query()? {
+                    expr
+                } else if let Some(lambda) = self.try_parse_lambda()? {
+                    return Ok(lambda);
+                } else {
+                    let exprs = 
self.parse_comma_separated(Parser::parse_expr)?;
+                    match exprs.len() {
+                        0 => unreachable!(), // parse_comma_separated ensures 
1 or more
+                        1 => 
Expr::Nested(Box::new(exprs.into_iter().next().unwrap())),
+                        _ => Expr::Tuple(exprs),
+                    }
+                };
+                self.expect_token(&Token::RParen)?;
+                let expr = self.try_parse_method(expr)?;
+                if !self.consume_token(&Token::Period) {
+                    Ok(expr)
+                } else {
+                    let tok = self.next_token();
+                    let key = match tok.token {
+                        Token::Word(word) => word.to_ident(),
+                        _ => {
+                            return parser_err!(
+                                format!("Expected identifier, found: {tok}"),
+                                tok.location
+                            )
+                        }
+                    };
+                    Ok(Expr::CompositeAccess {
+                        expr: Box::new(expr),
+                        key,
+                    })
+                }
+            }
+            Token::Placeholder(_) | Token::Colon | Token::AtSign => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
+            Token::LBrace if self.dialect.supports_dictionary_syntax() => {
+                self.prev_token();
+                self.parse_duckdb_struct_literal()
+            }
+            _ => self.expected("an expression", next_token),
+        }?;
 
-                        if ends_with_wildcard {
-                            Ok(Expr::QualifiedWildcard(ObjectName(id_parts)))
-                        } else if self.consume_token(&Token::LParen) {
-                            if dialect_of!(self is SnowflakeDialect | 
MsSqlDialect)
-                                && self.consume_tokens(&[Token::Plus, 
Token::RParen])
-                            {
-                                Ok(Expr::OuterJoin(Box::new(
-                                    match <[Ident; 1]>::try_from(id_parts) {
-                                        Ok([ident]) => Expr::Identifier(ident),
-                                        Err(parts) => 
Expr::CompoundIdentifier(parts),
-                                    },
-                                )))
-                            } else {
-                                self.prev_token();
-                                self.parse_function(ObjectName(id_parts))
+        let expr = self.try_parse_method(expr)?;
+
+        if self.parse_keyword(Keyword::COLLATE) {
+            Ok(Expr::Collate {
+                expr: Box::new(expr),
+                collation: self.parse_object_name(false)?,
+            })
+        } else {
+            Ok(expr)
+        }
+    }
+
+    /// Parse an expression prefix.
+    pub fn parse_prefix2(&mut self) -> Result<Expr, ParserError> {

Review Comment:
   Ah, this version seems unused; can we remove it?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Fallback to identifier parsing if expression parsing fails [datafusion-sqlparser-rs]

Reply via email to