This is an automated email from the ASF dual-hosted git repository.

iffyio pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 8bc63f0e Correctly tokenize nested comments (#1629)
8bc63f0e is described below

commit 8bc63f0e4a01b3b8a2e694e42a99dc3665a06b8e
Author: Hans Ott <[email protected]>
AuthorDate: Sun Jan 5 15:37:34 2025 +0100

    Correctly tokenize nested comments (#1629)
---
 src/dialect/generic.rs    |   4 ++
 src/dialect/mod.rs        |   6 +++
 src/dialect/postgresql.rs |   4 ++
 src/tokenizer.rs          | 111 +++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 108 insertions(+), 17 deletions(-)

diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
index f852152a..e2a73de8 100644
--- a/src/dialect/generic.rs
+++ b/src/dialect/generic.rs
@@ -131,4 +131,8 @@ impl Dialect for GenericDialect {
     fn supports_empty_projections(&self) -> bool {
         true
     }
+
+    fn supports_nested_comments(&self) -> bool {
+        true
+    }
 }
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index 1343efca..9ffbd8ed 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -682,6 +682,12 @@ pub trait Dialect: Debug + Any {
         false
     }
 
+    /// Returns true if the dialect supports nested comments
+    /// e.g. `/* /* nested */ */`
+    fn supports_nested_comments(&self) -> bool {
+        false
+    }
+
     /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
     /// as an alias assignment operator, rather than a boolean expression.
     /// For example: the following statements are equivalent for such a dialect:
diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs
index 6a13a386..170b0a7c 100644
--- a/src/dialect/postgresql.rs
+++ b/src/dialect/postgresql.rs
@@ -241,6 +241,10 @@ impl Dialect for PostgreSqlDialect {
     fn supports_empty_projections(&self) -> bool {
         true
     }
+
+    fn supports_nested_comments(&self) -> bool {
+        true
+    }
 }
 
 pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index da61303b..38bd33d6 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1855,28 +1855,33 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;
-        let mut last_ch = ' ';
+        let supports_nested_comments = self.dialect.supports_nested_comments();
 
         loop {
             match chars.next() {
-                Some(ch) => {
-                    if last_ch == '/' && ch == '*' {
-                        nested += 1;
-                    } else if last_ch == '*' && ch == '/' {
-                        nested -= 1;
-                        if nested == 0 {
-                            s.pop();
-                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
-                        }
+                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
+                    chars.next(); // consume the '*'
+                    s.push('/');
+                    s.push('*');
+                    nested += 1;
+                }
+                Some('*') if matches!(chars.peek(), Some('/')) => {
+                    chars.next(); // consume the '/'
+                    nested -= 1;
+                    if nested == 0 {
+                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                     }
+                    s.push('*');
+                    s.push('/');
+                }
+                Some(ch) => {
                     s.push(ch);
-                    last_ch = ch;
                 }
                 None => {
                     break self.tokenizer_error(
                         chars.location(),
                         "Unexpected EOF while in a multi-line comment",
-                    )
+                    );
                 }
             }
         }
@@ -2718,18 +2723,90 @@ mod tests {
 
     #[test]
     fn tokenize_nested_multiline_comment() {
-        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
+        let dialect = GenericDialect {};
+        let test_cases = vec![
+            (
+                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
+                vec![
+                    Token::Number("0".to_string(), false),
+                    Token::Whitespace(Whitespace::MultiLineComment(
+                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
+                    )),
+                    Token::Whitespace(Whitespace::Space),
+                    Token::Div,
+                    Token::Word(Word {
+                        value: "comment".to_string(),
+                        quote_style: None,
+                        keyword: Keyword::COMMENT,
+                    }),
+                    Token::Mul,
+                    Token::Div,
+                    Token::Number("1".to_string(), false),
+                ],
+            ),
+            (
+                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
+                vec![
+                    Token::Number("0".to_string(), false),
+                    Token::Whitespace(Whitespace::MultiLineComment(
+                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
+                    )),
+                    Token::Number("1".to_string(), false),
+                ],
+            ),
+            (
+                "SELECT 1/* a /* b */ c */0",
+                vec![
+                    Token::make_keyword("SELECT"),
+                    Token::Whitespace(Whitespace::Space),
+                    Token::Number("1".to_string(), false),
+                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
+                    Token::Number("0".to_string(), false),
+                ],
+            ),
+        ];
+
+        for (sql, expected) in test_cases {
+            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
+            compare(expected, tokens);
+        }
+    }
+
+    #[test]
+    fn tokenize_nested_multiline_comment_empty() {
+        let sql = "select 1/*/**/*/0";
 
         let dialect = GenericDialect {};
-        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
         let expected = vec![
+            Token::make_keyword("select"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
             Token::Number("0".to_string(), false),
+        ];
+
+        compare(expected, tokens);
+    }
+
+    #[test]
+    fn tokenize_nested_comments_if_not_supported() {
+        let dialect = SQLiteDialect {};
+        let sql = "SELECT 1/*/* nested comment */*/0";
+        let tokens = Tokenizer::new(&dialect, sql).tokenize();
+        let expected = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
             Token::Whitespace(Whitespace::MultiLineComment(
-                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
+                "/* nested comment ".to_string(),
             )),
-            Token::Number("1".to_string(), false),
+            Token::Mul,
+            Token::Div,
+            Token::Number("0".to_string(), false),
         ];
-        compare(expected, tokens);
+
+        compare(expected, tokens.unwrap());
     }
 
     #[test]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to