This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new c0998832 Correctly tokenize nested comments in Databricks, Clickhouse, and ANSI (#2044)
c0998832 is described below

commit c0998832a2f018915ad4464db6aefa5b1e2f4447
Author: Joey Hain <[email protected]>
AuthorDate: Fri Sep 26 01:05:23 2025 -0700

    Correctly tokenize nested comments in Databricks, Clickhouse, and ANSI (#2044)
---
 src/dialect/ansi.rs       |   5 ++
 src/dialect/clickhouse.rs |   6 ++
 src/dialect/databricks.rs |   5 ++
 src/tokenizer.rs          | 141 +++++++++++++++++++++-------------------------
 4 files changed, 81 insertions(+), 76 deletions(-)

diff --git a/src/dialect/ansi.rs b/src/dialect/ansi.rs
index 32ba7b32..ec3c095b 100644
--- a/src/dialect/ansi.rs
+++ b/src/dialect/ansi.rs
@@ -33,4 +33,9 @@ impl Dialect for AnsiDialect {
     fn require_interval_qualifier(&self) -> bool {
         true
     }
+
+    /// The SQL standard explicitly states that block comments nest.
+    fn supports_nested_comments(&self) -> bool {
+        true
+    }
 }
diff --git a/src/dialect/clickhouse.rs b/src/dialect/clickhouse.rs
index f5e70c30..bdac1f57 100644
--- a/src/dialect/clickhouse.rs
+++ b/src/dialect/clickhouse.rs
@@ -94,4 +94,10 @@ impl Dialect for ClickHouseDialect {
     fn supports_group_by_with_modifier(&self) -> bool {
         true
     }
+
+    /// Supported since 2020.
+    /// See <https://clickhouse.com/docs/whats-new/changelog/2020#backward-incompatible-change-2>
+    fn supports_nested_comments(&self) -> bool {
+        true
+    }
 }
diff --git a/src/dialect/databricks.rs b/src/dialect/databricks.rs
index a3476b1b..4bb8c8d5 100644
--- a/src/dialect/databricks.rs
+++ b/src/dialect/databricks.rs
@@ -64,4 +64,9 @@ impl Dialect for DatabricksDialect {
     fn supports_struct_literal(&self) -> bool {
         true
     }
+
+    /// See <https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-comment>
+    fn supports_nested_comments(&self) -> bool {
+        true
+    }
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 8382a534..54a158c1 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -2419,7 +2419,7 @@ mod tests {
     use crate::dialect::{
         BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
     };
-    use crate::test_utils::all_dialects_where;
+    use crate::test_utils::{all_dialects_except, all_dialects_where};
     use core::fmt::Debug;
 
     #[test]
@@ -3169,90 +3169,79 @@ mod tests {
 
     #[test]
     fn tokenize_nested_multiline_comment() {
-        let dialect = GenericDialect {};
-        let test_cases = vec![
-            (
-                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
-                vec![
-                    Token::Number("0".to_string(), false),
-                    Token::Whitespace(Whitespace::MultiLineComment(
-                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
-                    )),
-                    Token::Whitespace(Whitespace::Space),
-                    Token::Div,
-                    Token::Word(Word {
-                        value: "comment".to_string(),
-                        quote_style: None,
-                        keyword: Keyword::COMMENT,
-                    }),
-                    Token::Mul,
-                    Token::Div,
-                    Token::Number("1".to_string(), false),
-                ],
-            ),
-            (
-                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
-                vec![
-                    Token::Number("0".to_string(), false),
-                    Token::Whitespace(Whitespace::MultiLineComment(
-                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
-                    )),
-                    Token::Number("1".to_string(), false),
-                ],
-            ),
-            (
-                "SELECT 1/* a /* b */ c */0",
-                vec![
-                    Token::make_keyword("SELECT"),
-                    Token::Whitespace(Whitespace::Space),
-                    Token::Number("1".to_string(), false),
-                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
-                    Token::Number("0".to_string(), false),
-                ],
-            ),
-        ];
+        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
+            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
+            vec![
+                Token::Number("0".to_string(), false),
+                Token::Whitespace(Whitespace::MultiLineComment(
+                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
+                )),
+                Token::Whitespace(Whitespace::Space),
+                Token::Div,
+                Token::Word(Word {
+                    value: "comment".to_string(),
+                    quote_style: None,
+                    keyword: Keyword::COMMENT,
+                }),
+                Token::Mul,
+                Token::Div,
+                Token::Number("1".to_string(), false),
+            ],
+        );
 
-        for (sql, expected) in test_cases {
-            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
-            compare(expected, tokens);
-        }
+        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
+            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
+            vec![
+                Token::Number("0".to_string(), false),
+                Token::Whitespace(Whitespace::MultiLineComment(
+                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
+                )),
+                Token::Number("1".to_string(), false),
+            ],
+        );
+
+        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
+            "SELECT 1/* a /* b */ c */0",
+            vec![
+                Token::make_keyword("SELECT"),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("1".to_string(), false),
+                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
+                Token::Number("0".to_string(), false),
+            ],
+        );
     }
 
     #[test]
     fn tokenize_nested_multiline_comment_empty() {
-        let sql = "select 1/*/**/*/0";
-
-        let dialect = GenericDialect {};
-        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
-        let expected = vec![
-            Token::make_keyword("select"),
-            Token::Whitespace(Whitespace::Space),
-            Token::Number("1".to_string(), false),
-            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
-            Token::Number("0".to_string(), false),
-        ];
-
-        compare(expected, tokens);
+        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
+            "select 1/*/**/*/0",
+            vec![
+                Token::make_keyword("select"),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("1".to_string(), false),
+                Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
+                Token::Number("0".to_string(), false),
+            ],
+        );
     }
 
     #[test]
     fn tokenize_nested_comments_if_not_supported() {
-        let dialect = SQLiteDialect {};
-        let sql = "SELECT 1/*/* nested comment */*/0";
-        let tokens = Tokenizer::new(&dialect, sql).tokenize();
-        let expected = vec![
-            Token::make_keyword("SELECT"),
-            Token::Whitespace(Whitespace::Space),
-            Token::Number("1".to_string(), false),
-            Token::Whitespace(Whitespace::MultiLineComment(
-                "/* nested comment ".to_string(),
-            )),
-            Token::Mul,
-            Token::Div,
-            Token::Number("0".to_string(), false),
-        ];
-
-        compare(expected, tokens.unwrap());
+        all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
+            "SELECT 1/*/* nested comment */*/0",
+            vec![
+                Token::make_keyword("SELECT"),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("1".to_string(), false),
+                Token::Whitespace(Whitespace::MultiLineComment(
+                    "/* nested comment ".to_string(),
+                )),
+                Token::Mul,
+                Token::Div,
+                Token::Number("0".to_string(), false),
+            ],
+        );
     }
 
     #[test]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to