iffyio commented on code in PR #1835:
URL:
https://github.com/apache/datafusion-sqlparser-rs/pull/1835#discussion_r2072413695
##########
src/tokenizer.rs:
##########
@@ -1281,20 +1262,91 @@ impl<'a> Tokenizer<'a> {
return Ok(Some(Token::make_word(s.as_str(),
None)));
}
} else if prev_token == Some(&Token::Period) {
- // If the previous token was a period, thus not
belonging to a number,
- // the value we have is part of an identifier.
+ // Handle as word if it follows a period
return Ok(Some(Token::make_word(s.as_str(),
None)));
}
}
+ // Handle "L" suffix for long numbers
let long = if chars.peek() == Some(&'L') {
chars.next();
true
} else {
false
};
+
+ // Return the final token for the number
Ok(Some(Token::Number(s, long)))
}
+
+ // Period (`.`) handling
+ '.' => {
+ chars.next(); // consume the dot
+
+ match chars.peek() {
+ // Handle "._" case as a period followed by identifier
+ // if the last token was a word
+ Some('_') if matches!(prev_token,
Some(Token::Word(_))) => {
+ Ok(Some(Token::Period))
+ }
+ Some('_') => {
+ self.tokenizer_error(
+ chars.location(),
+ "Unexpected underscore here".to_string(),
+ )
+ }
+ Some(ch)
+ // Hive and mysql dialects allow numeric prefixes
for identifers
+ if ch.is_ascii_digit()
+ && self.dialect.supports_numeric_prefix()
+ && matches!(prev_token, Some(Token::Word(_)))
=>
+ {
+ Ok(Some(Token::Period))
+ }
+ Some(ch) if ch.is_ascii_digit() => {
+ // Handle numbers starting with a dot (e.g.,
".123")
+ let mut s = String::from(".");
+ let is_number_separator = |ch: char, next_char:
Option<char>| {
+
self.dialect.supports_numeric_literal_underscores()
+ && ch == '_'
+ && next_char.is_some_and(|c|
c.is_ascii_digit())
+ };
+
+ s += &peeking_next_take_while(chars, |ch, next_ch|
{
+ ch.is_ascii_digit() || is_number_separator(ch,
next_ch)
+ });
+
+ // Handle exponent part
+ if matches!(chars.peek(), Some('e' | 'E')) {
+ let mut exp = String::new();
+ exp.push(chars.next().unwrap());
+
+ if matches!(chars.peek(), Some('+' | '-')) {
+ exp.push(chars.next().unwrap());
+ }
+
+ if matches!(chars.peek(), Some(c) if
c.is_ascii_digit()) {
+ exp += &peeking_take_while(chars, |c|
c.is_ascii_digit());
+ s += &exp;
+ }
+ }
+
+ // Handle "L" suffix for long numbers
+ let long = if chars.peek() == Some(&'L') {
+ chars.next();
+ true
Review Comment:
Hmm, most of this logic looks to already be duplicated on the [number
parsing](https://github.com/apache/datafusion-sqlparser-rs/blob/f05913dd698a1fa2eeebf4f6a286f3cf19ddaff7/src/tokenizer.rs#L1224-L1276)
code path, so that side effect would be undesirable, I think.
If I understood the issue being solved, it's only the case of `._` being
parsed as a number. Would it be possible/more desirable to only update the
existing logic to properly detect and handle that case, or is the current logic
not well equipped to handle that sanely?
##########
src/tokenizer.rs:
##########
@@ -1281,20 +1262,91 @@ impl<'a> Tokenizer<'a> {
return Ok(Some(Token::make_word(s.as_str(),
None)));
}
} else if prev_token == Some(&Token::Period) {
- // If the previous token was a period, thus not
belonging to a number,
- // the value we have is part of an identifier.
+ // Handle as word if it follows a period
return Ok(Some(Token::make_word(s.as_str(),
None)));
}
}
+ // Handle "L" suffix for long numbers
let long = if chars.peek() == Some(&'L') {
chars.next();
true
} else {
false
};
+
+ // Return the final token for the number
Ok(Some(Token::Number(s, long)))
}
+
+ // Period (`.`) handling
+ '.' => {
+ chars.next(); // consume the dot
+
+ match chars.peek() {
+ // Handle "._" case as a period followed by identifier
+ // if the last token was a word
+ Some('_') if matches!(prev_token,
Some(Token::Word(_))) => {
+ Ok(Some(Token::Period))
+ }
+ Some('_') => {
+ self.tokenizer_error(
+ chars.location(),
+ "Unexpected underscore here".to_string(),
+ )
+ }
Review Comment:
I wonder if it's worth returning an error here, or whether we lose anything by
allowing the tokenizer to continue? I'm guessing it's still possible for the
tokenizer to return `Token::Period` here as well?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]