This is an automated email from the ASF dual-hosted git repository.
iffyio pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 8bc63f0e Correctly tokenize nested comments (#1629)
8bc63f0e is described below
commit 8bc63f0e4a01b3b8a2e694e42a99dc3665a06b8e
Author: Hans Ott <[email protected]>
AuthorDate: Sun Jan 5 15:37:34 2025 +0100
Correctly tokenize nested comments (#1629)
---
src/dialect/generic.rs | 4 ++
src/dialect/mod.rs | 6 +++
src/dialect/postgresql.rs | 4 ++
src/tokenizer.rs | 111 +++++++++++++++++++++++++++++++++++++++-------
4 files changed, 108 insertions(+), 17 deletions(-)
diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
index f852152a..e2a73de8 100644
--- a/src/dialect/generic.rs
+++ b/src/dialect/generic.rs
@@ -131,4 +131,8 @@ impl Dialect for GenericDialect {
fn supports_empty_projections(&self) -> bool {
true
}
+
+ fn supports_nested_comments(&self) -> bool {
+ true
+ }
}
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index 1343efca..9ffbd8ed 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -682,6 +682,12 @@ pub trait Dialect: Debug + Any {
false
}
+ /// Returns true if the dialect supports nested comments
+ /// e.g. `/* /* nested */ */`
+ fn supports_nested_comments(&self) -> bool {
+ false
+ }
+
/// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
/// as an alias assignment operator, rather than a boolean expression.
/// For example: the following statements are equivalent for such a dialect:
diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs
index 6a13a386..170b0a7c 100644
--- a/src/dialect/postgresql.rs
+++ b/src/dialect/postgresql.rs
@@ -241,6 +241,10 @@ impl Dialect for PostgreSqlDialect {
fn supports_empty_projections(&self) -> bool {
true
}
+
+ fn supports_nested_comments(&self) -> bool {
+ true
+ }
}
pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index da61303b..38bd33d6 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1855,28 +1855,33 @@ impl<'a> Tokenizer<'a> {
) -> Result<Option<Token>, TokenizerError> {
let mut s = String::new();
let mut nested = 1;
- let mut last_ch = ' ';
+ let supports_nested_comments = self.dialect.supports_nested_comments();
loop {
match chars.next() {
- Some(ch) => {
- if last_ch == '/' && ch == '*' {
- nested += 1;
- } else if last_ch == '*' && ch == '/' {
- nested -= 1;
- if nested == 0 {
- s.pop();
- break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
- }
+ Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
+ chars.next(); // consume the '*'
+ s.push('/');
+ s.push('*');
+ nested += 1;
+ }
+ Some('*') if matches!(chars.peek(), Some('/')) => {
+ chars.next(); // consume the '/'
+ nested -= 1;
+ if nested == 0 {
+ break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
}
+ s.push('*');
+ s.push('/');
+ }
+ Some(ch) => {
s.push(ch);
- last_ch = ch;
}
None => {
break self.tokenizer_error(
chars.location(),
"Unexpected EOF while in a multi-line comment",
- )
+ );
}
}
}
@@ -2718,18 +2723,90 @@ mod tests {
#[test]
fn tokenize_nested_multiline_comment() {
- let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
+ let dialect = GenericDialect {};
+ let test_cases = vec![
+ (
+ "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
+ vec![
+ Token::Number("0".to_string(), false),
+ Token::Whitespace(Whitespace::MultiLineComment(
+ "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
+ )),
+ Token::Whitespace(Whitespace::Space),
+ Token::Div,
+ Token::Word(Word {
+ value: "comment".to_string(),
+ quote_style: None,
+ keyword: Keyword::COMMENT,
+ }),
+ Token::Mul,
+ Token::Div,
+ Token::Number("1".to_string(), false),
+ ],
+ ),
+ (
+ "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
+ vec![
+ Token::Number("0".to_string(), false),
+ Token::Whitespace(Whitespace::MultiLineComment(
+ "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
+ )),
+ Token::Number("1".to_string(), false),
+ ],
+ ),
+ (
+ "SELECT 1/* a /* b */ c */0",
+ vec![
+ Token::make_keyword("SELECT"),
+ Token::Whitespace(Whitespace::Space),
+ Token::Number("1".to_string(), false),
+ Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
+ Token::Number("0".to_string(), false),
+ ],
+ ),
+ ];
+
+ for (sql, expected) in test_cases {
+ let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
+ compare(expected, tokens);
+ }
+ }
+
+ #[test]
+ fn tokenize_nested_multiline_comment_empty() {
+ let sql = "select 1/*/**/*/0";
let dialect = GenericDialect {};
- let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+ let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
+ Token::make_keyword("select"),
+ Token::Whitespace(Whitespace::Space),
+ Token::Number("1".to_string(), false),
+ Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
Token::Number("0".to_string(), false),
+ ];
+
+ compare(expected, tokens);
+ }
+
+ #[test]
+ fn tokenize_nested_comments_if_not_supported() {
+ let dialect = SQLiteDialect {};
+ let sql = "SELECT 1/*/* nested comment */*/0";
+ let tokens = Tokenizer::new(&dialect, sql).tokenize();
+ let expected = vec![
+ Token::make_keyword("SELECT"),
+ Token::Whitespace(Whitespace::Space),
+ Token::Number("1".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
- "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
+ "/* nested comment ".to_string(),
)),
- Token::Number("1".to_string(), false),
+ Token::Mul,
+ Token::Div,
+ Token::Number("0".to_string(), false),
];
- compare(expected, tokens);
+
+ compare(expected, tokens.unwrap());
}
#[test]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]