This is an automated email from the ASF dual-hosted git repository. github-bot pushed a commit to branch gh-readonly-queue/main/pr-2034-e4c550002f6b3d68ed6698e0a8ac6bce633607d6 in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git
commit 23acd2376698badf0d7f4e5ed818ff606b5357a4 Author: Marcelo Altmann <[email protected]> AuthorDate: Fri Feb 6 21:10:42 2026 +0530 Add support for C-style comments (#2034) --- src/dialect/generic.rs | 4 ++ src/dialect/mod.rs | 6 +++ src/dialect/mysql.rs | 5 ++ src/tokenizer.rs | 142 +++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 154 insertions(+), 3 deletions(-) diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs index 38f12cc8..5e929d73 100644 --- a/src/dialect/generic.rs +++ b/src/dialect/generic.rs @@ -177,6 +177,10 @@ impl Dialect for GenericDialect { true } + fn supports_multiline_comment_hints(&self) -> bool { + true + } + fn supports_user_host_grantee(&self) -> bool { true } diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 15a9c2d1..d0b87d96 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -1099,6 +1099,12 @@ pub trait Dialect: Debug + Any { false } + /// Returns true if the dialect supports optimizer hints in multiline comments + /// e.g. `/*!50110 KEY_BLOCK_SIZE = 1024*/` + fn supports_multiline_comment_hints(&self) -> bool { + false + } + /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem` /// as an alias assignment operator, rather than a boolean expression. /// For example: the following statements are equivalent for such a dialect: diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs index e08c1c78..51a43f89 100644 --- a/src/dialect/mysql.rs +++ b/src/dialect/mysql.rs @@ -89,6 +89,11 @@ impl Dialect for MySqlDialect { true } + /// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html> + fn supports_multiline_comment_hints(&self) -> bool { + true + } + fn parse_infix( &self, parser: &mut crate::parser::Parser, diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 506dee1d..cc5a2aa1 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -945,10 +945,65 @@ impl<'a> Tokenizer<'a> { while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? { let span = location.span_to(state.location()); - buf.push(TokenWithSpan { token, span }); + // Check if this is a multiline comment hint that should be expanded + match &token { + Token::Whitespace(Whitespace::MultiLineComment(comment)) + if self.dialect.supports_multiline_comment_hints() + && comment.starts_with('!') => + { + // Re-tokenize the hints and add them to the buffer + self.tokenize_comment_hints(comment, span, buf)?; + } + _ => { + buf.push(TokenWithSpan { token, span }); + } + } + + location = state.location(); + } + Ok(()) + } + + /// Re-tokenize optimizer hints from a multiline comment and add them to the buffer. + /// For example, `/*!50110 KEY_BLOCK_SIZE = 1024*/` becomes tokens for `KEY_BLOCK_SIZE = 1024` + fn tokenize_comment_hints( + &self, + comment: &str, + span: Span, + buf: &mut Vec<TokenWithSpan>, + ) -> Result<(), TokenizerError> { + // Strip the leading '!' and any version digits (e.g., "50110") + let hint_content = comment + .strip_prefix('!') + .unwrap_or(comment) + .trim_start_matches(|c: char| c.is_ascii_digit()); + + // If there's no content after stripping, nothing to tokenize + if hint_content.is_empty() { + return Ok(()); + } + + // Create a new tokenizer for the hint content + let inner = Tokenizer::new(self.dialect, hint_content).with_unescape(self.unescape); + + // Create a state for tracking position within the hint + let mut state = State { + peekable: hint_content.chars().peekable(), + line: span.start.line, + col: span.start.column, + }; + // Tokenize the hint content and add tokens to the buffer + let mut location = state.location(); + while let Some(token) = inner.next_token(&mut state, buf.last().map(|t| &t.token))? { + let token_span = location.span_to(state.location()); + buf.push(TokenWithSpan { + token, + span: token_span, + }); location = state.location(); } + Ok(()) } @@ -2233,7 +2288,6 @@ impl<'a> Tokenizer<'a> { let mut s = String::new(); let mut nested = 1; let supports_nested_comments = self.dialect.supports_nested_comments(); - loop { match chars.next() { Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => { @@ -4218,6 +4272,88 @@ mod tests { Token::Whitespace(Whitespace::Space), Token::make_word("y", None), ], - ) + ); + } + + #[test] + fn tokenize_multiline_comment_with_comment_hint() { + let sql = String::from("0/*! word */1"); + + let dialect = MySqlDialect {}; + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + let expected = vec![ + Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::Space), + Token::Word(Word { + value: "word".to_string(), + quote_style: None, + keyword: Keyword::NoKeyword, + }), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + ]; + compare(expected, tokens); + } + + #[test] + fn tokenize_multiline_comment_with_comment_hint_and_version() { + let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1"); + let dialect = MySqlDialect {}; + let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap(); + let expected = vec![ + Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::Space), + Token::Whitespace(Whitespace::Space), + Token::Word(Word { + value: "KEY_BLOCK_SIZE".to_string(), + quote_style: None, + keyword: Keyword::KEY_BLOCK_SIZE, + }), + Token::Whitespace(Whitespace::Space), + Token::Eq, + Token::Whitespace(Whitespace::Space), + Token::Number("1024".to_string(), false), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + ]; + compare(expected, tokens); + + let tokens = Tokenizer::new(&dialect, "0 /*!50110 */ 1") + .tokenize() + .unwrap(); + compare( + vec![ + Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::Space), + Token::Whitespace(Whitespace::Space), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + ], + tokens, + ); + + let tokens = Tokenizer::new(&dialect, "0 /*!*/ 1").tokenize().unwrap(); + compare( + vec![ + Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::Space), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + ], + tokens, + ); + let tokens = Tokenizer::new(&dialect, "0 /*! */ 1").tokenize().unwrap(); + compare( + vec![ + Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::Space), + Token::Whitespace(Whitespace::Space), + Token::Whitespace(Whitespace::Space), + Token::Whitespace(Whitespace::Space), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(), false), + ], + tokens, + ); } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
