This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git
The following commit(s) were added to refs/heads/main by this push:
new c0998832 Correctly tokenize nested comments in Databricks, Clickhouse,
and ANSI (#2044)
c0998832 is described below
commit c0998832a2f018915ad4464db6aefa5b1e2f4447
Author: Joey Hain <[email protected]>
AuthorDate: Fri Sep 26 01:05:23 2025 -0700
Correctly tokenize nested comments in Databricks, Clickhouse, and ANSI
(#2044)
---
src/dialect/ansi.rs | 5 ++
src/dialect/clickhouse.rs | 6 ++
src/dialect/databricks.rs | 5 ++
src/tokenizer.rs | 141 +++++++++++++++++++++-------------------------
4 files changed, 81 insertions(+), 76 deletions(-)
diff --git a/src/dialect/ansi.rs b/src/dialect/ansi.rs
index 32ba7b32..ec3c095b 100644
--- a/src/dialect/ansi.rs
+++ b/src/dialect/ansi.rs
@@ -33,4 +33,9 @@ impl Dialect for AnsiDialect {
fn require_interval_qualifier(&self) -> bool {
true
}
+
+ /// The SQL standard explicitly states that block comments nest.
+ fn supports_nested_comments(&self) -> bool {
+ true
+ }
}
diff --git a/src/dialect/clickhouse.rs b/src/dialect/clickhouse.rs
index f5e70c30..bdac1f57 100644
--- a/src/dialect/clickhouse.rs
+++ b/src/dialect/clickhouse.rs
@@ -94,4 +94,10 @@ impl Dialect for ClickHouseDialect {
fn supports_group_by_with_modifier(&self) -> bool {
true
}
+
+ /// Supported since 2020.
+ /// See <https://clickhouse.com/docs/whats-new/changelog/2020#backward-incompatible-change-2>
+ fn supports_nested_comments(&self) -> bool {
+ true
+ }
}
diff --git a/src/dialect/databricks.rs b/src/dialect/databricks.rs
index a3476b1b..4bb8c8d5 100644
--- a/src/dialect/databricks.rs
+++ b/src/dialect/databricks.rs
@@ -64,4 +64,9 @@ impl Dialect for DatabricksDialect {
fn supports_struct_literal(&self) -> bool {
true
}
+
+ /// See <https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-comment>
+ fn supports_nested_comments(&self) -> bool {
+ true
+ }
}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 8382a534..54a158c1 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -2419,7 +2419,7 @@ mod tests {
use crate::dialect::{
BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect,
MySqlDialect, SQLiteDialect,
};
- use crate::test_utils::all_dialects_where;
+ use crate::test_utils::{all_dialects_except, all_dialects_where};
use core::fmt::Debug;
#[test]
@@ -3169,90 +3169,79 @@ mod tests {
#[test]
fn tokenize_nested_multiline_comment() {
- let dialect = GenericDialect {};
- let test_cases = vec![
- (
- "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
- vec![
- Token::Number("0".to_string(), false),
- Token::Whitespace(Whitespace::MultiLineComment(
- "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
- )),
- Token::Whitespace(Whitespace::Space),
- Token::Div,
- Token::Word(Word {
- value: "comment".to_string(),
- quote_style: None,
- keyword: Keyword::COMMENT,
- }),
- Token::Mul,
- Token::Div,
- Token::Number("1".to_string(), false),
- ],
- ),
- (
- "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
- vec![
- Token::Number("0".to_string(), false),
- Token::Whitespace(Whitespace::MultiLineComment(
- "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
- )),
- Token::Number("1".to_string(), false),
- ],
- ),
- (
- "SELECT 1/* a /* b */ c */0",
- vec![
- Token::make_keyword("SELECT"),
- Token::Whitespace(Whitespace::Space),
- Token::Number("1".to_string(), false),
- Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
- Token::Number("0".to_string(), false),
- ],
- ),
- ];
+ all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
+ "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
+ vec![
+ Token::Number("0".to_string(), false),
+ Token::Whitespace(Whitespace::MultiLineComment(
+ "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
+ )),
+ Token::Whitespace(Whitespace::Space),
+ Token::Div,
+ Token::Word(Word {
+ value: "comment".to_string(),
+ quote_style: None,
+ keyword: Keyword::COMMENT,
+ }),
+ Token::Mul,
+ Token::Div,
+ Token::Number("1".to_string(), false),
+ ],
+ );
- for (sql, expected) in test_cases {
- let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
- compare(expected, tokens);
- }
+ all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
+ "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
+ vec![
+ Token::Number("0".to_string(), false),
+ Token::Whitespace(Whitespace::MultiLineComment(
+ "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
+ )),
+ Token::Number("1".to_string(), false),
+ ],
+ );
+
+ all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
+ "SELECT 1/* a /* b */ c */0",
+ vec![
+ Token::make_keyword("SELECT"),
+ Token::Whitespace(Whitespace::Space),
+ Token::Number("1".to_string(), false),
+ Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
+ Token::Number("0".to_string(), false),
+ ],
+ );
}
#[test]
fn tokenize_nested_multiline_comment_empty() {
- let sql = "select 1/*/**/*/0";
-
- let dialect = GenericDialect {};
- let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
- let expected = vec![
- Token::make_keyword("select"),
- Token::Whitespace(Whitespace::Space),
- Token::Number("1".to_string(), false),
- Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
- Token::Number("0".to_string(), false),
- ];
-
- compare(expected, tokens);
+ all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
+ "select 1/*/**/*/0",
+ vec![
+ Token::make_keyword("select"),
+ Token::Whitespace(Whitespace::Space),
+ Token::Number("1".to_string(), false),
+ Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
+ Token::Number("0".to_string(), false),
+ ],
+ );
}
#[test]
fn tokenize_nested_comments_if_not_supported() {
- let dialect = SQLiteDialect {};
- let sql = "SELECT 1/*/* nested comment */*/0";
- let tokens = Tokenizer::new(&dialect, sql).tokenize();
- let expected = vec![
- Token::make_keyword("SELECT"),
- Token::Whitespace(Whitespace::Space),
- Token::Number("1".to_string(), false),
- Token::Whitespace(Whitespace::MultiLineComment(
- "/* nested comment ".to_string(),
- )),
- Token::Mul,
- Token::Div,
- Token::Number("0".to_string(), false),
- ];
-
- compare(expected, tokens.unwrap());
+ all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
+ "SELECT 1/*/* nested comment */*/0",
+ vec![
+ Token::make_keyword("SELECT"),
+ Token::Whitespace(Whitespace::Space),
+ Token::Number("1".to_string(), false),
+ Token::Whitespace(Whitespace::MultiLineComment(
+ "/* nested comment ".to_string(),
+ )),
+ Token::Mul,
+ Token::Div,
+ Token::Number("0".to_string(), false),
+ ],
+ );
}
#[test]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]