mvzink commented on code in PR #1747: URL: https://github.com/apache/datafusion-sqlparser-rs/pull/1747#discussion_r2011090576
########## src/dialect/mysql.rs: ########## @@ -145,6 +153,280 @@ impl Dialect for MySqlDialect { fn supports_comma_separated_set_assignments(&self) -> bool { true } + /// Dialect-specific table/view option parser override + /// + /// This method is called to parse the next table/view option. + /// + /// If `None` is returned, falls back to the default behavior. + /// <https://dev.mysql.com/doc/refman/8.4/en/create-table.html> + fn parse_plain_option(&self, parser: &mut Parser) -> Result<Option<SqlOption>, ParserError> { + //Some consts + const COMPRESSION_OPTS: [&str; 3] = ["ZLIB", "LZ4", "NONE"]; + const INSERT_METHODS_OPTS: [&str; 3] = ["NO", "FIRST", "LAST"]; + + let keyword = match parser.parse_one_of_keywords(&[ + Keyword::INSERT_METHOD, + Keyword::KEY_BLOCK_SIZE, + Keyword::ROW_FORMAT, + Keyword::DATA, + Keyword::INDEX, + Keyword::PACK_KEYS, + Keyword::STATS_AUTO_RECALC, + Keyword::STATS_PERSISTENT, + Keyword::STATS_SAMPLE_PAGES, + Keyword::DELAY_KEY_WRITE, + Keyword::COMPRESSION, + Keyword::ENCRYPTION, + Keyword::MAX_ROWS, + Keyword::MIN_ROWS, + Keyword::AUTOEXTEND_SIZE, + Keyword::AVG_ROW_LENGTH, + Keyword::CHECKSUM, + Keyword::CONNECTION, + Keyword::ENGINE_ATTRIBUTE, + Keyword::PASSWORD, + Keyword::SECONDARY_ENGINE_ATTRIBUTE, + Keyword::START, + Keyword::TABLESPACE, + Keyword::UNION, + ]) { + Some(keyword) => keyword, + None => return Ok(None), + }; + + //Optional equal sign + let _ = parser.consume_token(&Token::Eq); + let value = parser.next_token(); + let span = value.span; + + match (keyword, value.token) { + //Handled in common code + // ENGINE [=] engine_name + // AUTO_INCREMENT [=] value + // [DEFAULT] CHARACTER SET [=] charset_name + // [DEFAULT] COLLATE [=] collation_name + + // KEY_BLOCK_SIZE [=] value + (Keyword::KEY_BLOCK_SIZE, Token::Number(n, l)) => Ok(Some(SqlOption::KeyValue { + key: Ident::new("KEY_BLOCK_SIZE"), + value: Expr::value(Value::Number(Parser::parse(n, value.span.start)?, l)), + })), + + // ROW_FORMAT [=] {DEFAULT | DYNAMIC | FIXED | COMPRESSED | REDUNDANT | COMPACT} + (Keyword::ROW_FORMAT, Token::Word(w)) => Ok(Some(SqlOption::KeyValue { + key: Ident::new("ROW_FORMAT"), + value: Expr::Identifier(Ident::with_span(span, w.value)), + })), + + // {DATA | INDEX} DIRECTORY [=] 'absolute path to directory' + (Keyword::DATA, Token::Word(w)) if w.keyword == Keyword::DIRECTORY => { + let _ = parser.consume_token(&Token::Eq); + + let next_token = parser.next_token(); + + if let Token::SingleQuotedString(s) = next_token.token { + Ok(Some(SqlOption::KeyValue { + key: Ident::new("DATA DIRECTORY"), + value: Expr::value(Value::SingleQuotedString(s)), + })) + } else { + parser.expected("Token::SingleQuotedString", next_token)? + } + } + (Keyword::INDEX, Token::Word(w)) if w.keyword == Keyword::DIRECTORY => { + let _ = parser.consume_token(&Token::Eq); + let next_token = parser.next_token(); + + if let Token::SingleQuotedString(s) = next_token.token { + Ok(Some(SqlOption::KeyValue { + key: Ident::new("INDEX DIRECTORY"), + value: Expr::value(Value::SingleQuotedString(s)), + })) + } else { + parser.expected("Token::SingleQuotedString", next_token)? + } + } + + // PACK_KEYS [=] {0 | 1 | DEFAULT} + (Keyword::PACK_KEYS, Token::Number(n, l)) => Ok(Some(SqlOption::KeyValue { + key: Ident::new("PACK_KEYS"), + value: Expr::value(Value::Number(Parser::parse(n, value.span.start)?, l)), + })), + + (Keyword::PACK_KEYS, Token::Word(s)) if s.value.to_uppercase() == "DEFAULT" => { + Ok(Some(SqlOption::KeyValue { + key: Ident::new("PACK_KEYS"), + value: Expr::value(Value::SingleQuotedString(s.value)), + })) + } + + // STATS_AUTO_RECALC [=] {DEFAULT | 0 | 1} + (Keyword::STATS_AUTO_RECALC, Token::Number(n, l)) => Ok(Some(SqlOption::KeyValue { + key: Ident::new("STATS_AUTO_RECALC"), + value: Expr::value(Value::Number(Parser::parse(n, value.span.start)?, l)), + })), + + (Keyword::STATS_AUTO_RECALC, Token::Word(s)) if s.value.to_uppercase() == "DEFAULT" => { + Ok(Some(SqlOption::KeyValue { + key: Ident::new("STATS_AUTO_RECALC"), + value: Expr::value(Value::SingleQuotedString(s.value)), + })) + } + + //STATS_PERSISTENT [=] {DEFAULT | 0 | 1} + (Keyword::STATS_PERSISTENT, Token::Number(n, l)) => Ok(Some(SqlOption::KeyValue { + key: Ident::new("STATS_PERSISTENT"), + value: Expr::value(Value::Number(Parser::parse(n, value.span.start)?, l)), + })), + + (Keyword::STATS_PERSISTENT, Token::Word(s)) if s.value.to_uppercase() == "DEFAULT" => { + Ok(Some(SqlOption::KeyValue { + key: Ident::new("STATS_PERSISTENT"), + value: Expr::value(Value::SingleQuotedString(s.value)), + })) + } + + // STATS_SAMPLE_PAGES [=] value + (Keyword::STATS_SAMPLE_PAGES, Token::Number(n, l)) => Ok(Some(SqlOption::KeyValue { + key: Ident::new("STATS_SAMPLE_PAGES"), + value: Expr::value(Value::Number(Parser::parse(n, value.span.start)?, l)), + })), + + // DELAY_KEY_WRITE [=] {0 | 1} + (Keyword::DELAY_KEY_WRITE, Token::Number(n, l)) => Ok(Some(SqlOption::KeyValue { + key: Ident::new("DELAY_KEY_WRITE"), + value: Expr::value(Value::Number(Parser::parse(n, value.span.start)?, l)), + })), + + // COMPRESSION [=] {'ZLIB' | 'LZ4' | 'NONE'} + (Keyword::COMPRESSION, Token::SingleQuotedString(s)) + if COMPRESSION_OPTS.contains(&s.to_uppercase().as_str()) => Review Comment: This is fine but honestly I would skip this check entirely for reasons in a similar vein to what I mentioned in my other comment about dialect overrides. Since they are all single quoted strings anyway, it's not really a *syntactic* question as to which compression algorithms are allowed, but a semantic one. What if I were writing a MySQL-compatible DBMS with added compression options? Are you really going to make me override and copy and paste this method when the syntax is identical but the semantic content has changed? It's also a maintainability hazard if MySQL (or any MySQL-compatible system that users are parsing with `MySqlDialect`) adds a new compression algorithm. I think I feel the same way about `INSERT_METHODS_OPTS` but it's a tad less clear since those are identifiers not strings. If you are gonna keep these checks, I would inline the arrays instead of defining them at the top of the function, to keep that info local to where it's used. ########## src/dialect/mysql.rs: ########## @@ -141,6 +149,280 @@ impl Dialect for MySqlDialect { fn supports_set_names(&self) -> bool { true } + /// Dialect-specific table/view option parser override + /// + /// This method is called to parse the next table/view option. + /// + /// If `None` is returned, falls back to the default behavior. + /// <https://dev.mysql.com/doc/refman/8.4/en/create-table.html> + fn parse_plain_option(&self, parser: &mut Parser) -> Result<Option<SqlOption>, ParserError> { + //Some consts + const COMPRESSION_OPTS: [&str; 3] = ["ZLIB", "LZ4", "NONE"]; + const INSERT_METHODS_OPTS: [&str; 3] = ["NO", "FIRST", "LAST"]; + + let keyword = match parser.parse_one_of_keywords(&[ Review Comment: I philosophically agree with @iffyio's ask here, as I mention in my other big ranty comment (sorry 😛). However, I am not 100% convinced in practice, mostly because these options aren't comma separated and this could produce some really confusing parses if users make errors. I imagined the default parsing routine would be "parse ident, optionally parse equals sign, parse expr". But suppose a user intended to send `ROW_FORMAT COMPACT DEFAULT CHARACTER SET latin1`, but accidentally deleted `COMPACT`. Instead of a parse error like they would get if options were comma separated or hardcoded, they get a successful parse with `ROW_FORMAT = DEFAULT` followed by `CHARACTER SET = latin1`. Is it our fault? Not really. Is it better than the maintenance burden of keeping a list of valid one-word option names up to date? I'm honestly not sure. ########## src/parser/mod.rs: ########## @@ -6928,13 +6874,122 @@ impl<'a> Parser<'a> { }; } + let plain_options = self.parse_plain_options()?; + Ok(CreateTableConfiguration { partition_by, cluster_by, options, + plain_options, }) } + fn parse_plain_option(&mut self) -> Result<Option<SqlOption>, ParserError> { + if let Some(option) = self.dialect.parse_plain_option(self)? { + return Ok(Some(option)); + } + + // Parse table options in any order + if let Some(keyword) = self.parse_one_of_keywords(&[ + Keyword::ENGINE, + Keyword::AUTO_INCREMENT, + Keyword::DEFAULT, + Keyword::CHARSET, + Keyword::COLLATE, + Keyword::INSERT_METHOD, + ]) { Review Comment: You're right, that's a bigger question. I don't have a strong suggestion at the moment and this certainly seems servicable to me. My 2¢ on the bigger question is that introducing more dialect overrides (and actually this is more similar to `parse_column_option`, which is global, than Snowflake's `parse_create_table` which is only called from `SnowflakeDialect::parse_statement`, the normal top-level override) is a slippery slope toward fully dynamically dispatched parsing, but it's more justifiable if it's parsing something with a substantially different syntactic structure. However, this is *mostly* just parsing a different list of acceptable keywords but doing it in the same way. In such cases, I personally think more permissive generic parsing is preferable; e.g. does it really matter if `SnowflakeDialect` accepts `ROW_FORMAT = COMPACT` even though it doesn't make sense, so long as it doesn't introduce syntactic ambiguity? The exceptions of special parsing routines for a few op tions like `DATA DIRECTORY` and `TABLESPACE` are a good argument though. If anything, I would probably favor putting all the not-so-special options in the default implementation and only using the override for these options that require special parsing. Honestly, if it wouldn't introduce ambiguity (mostly due to options not being comma separated), I would probably even prefer the default/fallback routine for non-special options just be "parse ident, optionally parse equals sign, parse expr"—that's all thats's going to end up in the `SqlOption::KeyValue` at the end anyway, so why do we care which specific list of idents are acceptable? That is more of a semantic than syntactic question IMO. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org