(datafusion-sqlparser-rs) 01/01: Add support for C-style comments (#2034)

github-bot Fri, 06 Feb 2026 07:42:36 -0800

This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch 
gh-readonly-queue/main/pr-2034-e4c550002f6b3d68ed6698e0a8ac6bce633607d6
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git


commit 23acd2376698badf0d7f4e5ed818ff606b5357a4
Author: Marcelo Altmann <[email protected]>
AuthorDate: Fri Feb 6 21:10:42 2026 +0530

    Add support for C-style comments (#2034)
---
 src/dialect/generic.rs |   4 ++
 src/dialect/mod.rs     |   6 +++
 src/dialect/mysql.rs   |   5 ++
 src/tokenizer.rs       | 142 +++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 154 insertions(+), 3 deletions(-)

diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
index 38f12cc8..5e929d73 100644
--- a/src/dialect/generic.rs
+++ b/src/dialect/generic.rs
@@ -177,6 +177,10 @@ impl Dialect for GenericDialect {
         true
     }
 
+    fn supports_multiline_comment_hints(&self) -> bool {
+        true // opt in: `/*! ... */` comments are re-tokenized as hints by the tokenizer
+    }
+
     fn supports_user_host_grantee(&self) -> bool {
         true
     }
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index 15a9c2d1..d0b87d96 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -1099,6 +1099,12 @@ pub trait Dialect: Debug + Any {
         false
     }
 
+    /// Returns true if the dialect supports optimizer hints in multiline comments,
+    /// e.g. `/*!50110 KEY_BLOCK_SIZE = 1024*/`, whose content is then tokenized as regular SQL. Defaults to `false`.
+    fn supports_multiline_comment_hints(&self) -> bool {
+        false
+    }
+
     /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
     /// as an alias assignment operator, rather than a boolean expression.
     /// For example: the following statements are equivalent for such a dialect:
diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs
index e08c1c78..51a43f89 100644
--- a/src/dialect/mysql.rs
+++ b/src/dialect/mysql.rs
@@ -89,6 +89,11 @@ impl Dialect for MySqlDialect {
         true
     }
 
+    /// Enables re-tokenizing `/*! ... */` comment hints; see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
+    fn supports_multiline_comment_hints(&self) -> bool {
+        true
+    }
+
     fn parse_infix(
         &self,
         parser: &mut crate::parser::Parser,
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 506dee1d..cc5a2aa1 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -945,10 +945,65 @@ impl<'a> Tokenizer<'a> {
         while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
             let span = location.span_to(state.location());
 
-            buf.push(TokenWithSpan { token, span });
+            // Check if this is a multiline comment hint that should be expanded
+            match &token {
+                Token::Whitespace(Whitespace::MultiLineComment(comment))
+                    if self.dialect.supports_multiline_comment_hints()
+                        && comment.starts_with('!') =>
+                {
+                    // Re-tokenize the hints and add them to the buffer
+                    self.tokenize_comment_hints(comment, span, buf)?;
+                }
+                _ => {
+                    buf.push(TokenWithSpan { token, span });
+                }
+            }
+
+            location = state.location();
+        }
+        Ok(())
+    }
+
+    /// Re-tokenize optimizer hints from a multiline comment and add them to the buffer.
+    /// For example, `/*!50110 KEY_BLOCK_SIZE = 1024*/` becomes tokens for `KEY_BLOCK_SIZE = 1024`; tokenizer errors are propagated.
+    fn tokenize_comment_hints(
+        &self,
+        comment: &str,
+        span: Span,
+        buf: &mut Vec<TokenWithSpan>,
+    ) -> Result<(), TokenizerError> {
+        // Strip the leading '!' and any run of leading ASCII digits (the version, e.g., "50110")
+        let hint_content = comment
+            .strip_prefix('!')
+            .unwrap_or(comment)
+            .trim_start_matches(|c: char| c.is_ascii_digit());
+
+        // If there's no content after stripping, nothing to tokenize (nothing is pushed to `buf`)
+        if hint_content.is_empty() {
+            return Ok(());
+        }
+
+        // Create a new tokenizer for the hint content, reusing this tokenizer's dialect and unescape setting
+        let inner = Tokenizer::new(self.dialect, hint_content).with_unescape(self.unescape);
+
+        // Position tracking starts at the comment's start (`span.start`) — NOTE(review): hint token spans do not account for the `/*!<digits>` prefix; confirm this is intended
+        let mut state = State {
+            peekable: hint_content.chars().peekable(),
+            line: span.start.line,
+            col: span.start.column,
+        };
 
+        // Tokenize the hint content and append each token, with its own span, to the buffer
+        let mut location = state.location();
+        while let Some(token) = inner.next_token(&mut state, buf.last().map(|t| &t.token))? {
+            let token_span = location.span_to(state.location());
+            buf.push(TokenWithSpan {
+                token,
+                span: token_span,
+            });
             location = state.location();
         }
+
         Ok(())
     }
 
@@ -2233,7 +2288,6 @@ impl<'a> Tokenizer<'a> {
         let mut s = String::new();
         let mut nested = 1;
         let supports_nested_comments = self.dialect.supports_nested_comments();
-
         loop {
             match chars.next() {
                 Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
@@ -4218,6 +4272,88 @@ mod tests {
                 Token::Whitespace(Whitespace::Space),
                 Token::make_word("y", None),
             ],
-        )
+        );
+    }
+
+    #[test]
+    fn tokenize_multiline_comment_with_comment_hint() {
+        let sql = String::from("0/*! word */1"); // hint without version digits; its inner spaces become tokens
+
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Word(Word {
+                value: "word".to_string(),
+                quote_style: None,
+                keyword: Keyword::NoKeyword,
+            }),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+    }
+
+    #[test]
+    fn tokenize_multiline_comment_with_comment_hint_and_version() {
+        let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1"); // version digits are dropped, the rest is tokenized
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Whitespace(Whitespace::Space),
+            Token::Word(Word {
+                value: "KEY_BLOCK_SIZE".to_string(),
+                quote_style: None,
+                keyword: Keyword::KEY_BLOCK_SIZE,
+            }),
+            Token::Whitespace(Whitespace::Space),
+            Token::Eq,
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1024".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+
+        let tokens = Tokenizer::new(&dialect, "0 /*!50110 */ 1") // version only: just the inner space remains
+            .tokenize()
+            .unwrap();
+        compare(
+            vec![
+                Token::Number("0".to_string(), false),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("1".to_string(), false),
+            ],
+            tokens,
+        );
+
+        let tokens = Tokenizer::new(&dialect, "0 /*!*/ 1").tokenize().unwrap(); // empty hint adds no tokens
+        compare(
+            vec![
+                Token::Number("0".to_string(), false),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("1".to_string(), false),
+            ],
+            tokens,
+        );
+        let tokens = Tokenizer::new(&dialect, "0 /*!   */ 1").tokenize().unwrap(); // whitespace-only hint: three inner spaces
+        compare(
+            vec![
+                Token::Number("0".to_string(), false),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("1".to_string(), false),
+            ],
+            tokens,
+        );
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datafusion-sqlparser-rs) 01/01: Add support for C-style comments (#2034)

Reply via email to