Re: [PR] Add all missing table options to be handled in any order [datafusion-sqlparser-rs]

via GitHub Mon, 24 Mar 2025 18:22:28 -0700


mvzink commented on code in PR #1747:
URL: 
https://github.com/apache/datafusion-sqlparser-rs/pull/1747#discussion_r2011090576



##########
src/dialect/mysql.rs:
##########
@@ -145,6 +153,280 @@ impl Dialect for MySqlDialect {
     fn supports_comma_separated_set_assignments(&self) -> bool {
         true
     }
+    /// Dialect-specific table/view option parser override
+    ///
+    /// This method is called to parse the next table/view option.
+    ///
+    /// If `None` is returned, falls back to the default behavior.
+    /// <https://dev.mysql.com/doc/refman/8.4/en/create-table.html>
+    fn parse_plain_option(&self, parser: &mut Parser) -> 
Result<Option<SqlOption>, ParserError> {
+        //Some consts
+        const COMPRESSION_OPTS: [&str; 3] = ["ZLIB", "LZ4", "NONE"];
+        const INSERT_METHODS_OPTS: [&str; 3] = ["NO", "FIRST", "LAST"];
+
+        let keyword = match parser.parse_one_of_keywords(&[
+            Keyword::INSERT_METHOD,
+            Keyword::KEY_BLOCK_SIZE,
+            Keyword::ROW_FORMAT,
+            Keyword::DATA,
+            Keyword::INDEX,
+            Keyword::PACK_KEYS,
+            Keyword::STATS_AUTO_RECALC,
+            Keyword::STATS_PERSISTENT,
+            Keyword::STATS_SAMPLE_PAGES,
+            Keyword::DELAY_KEY_WRITE,
+            Keyword::COMPRESSION,
+            Keyword::ENCRYPTION,
+            Keyword::MAX_ROWS,
+            Keyword::MIN_ROWS,
+            Keyword::AUTOEXTEND_SIZE,
+            Keyword::AVG_ROW_LENGTH,
+            Keyword::CHECKSUM,
+            Keyword::CONNECTION,
+            Keyword::ENGINE_ATTRIBUTE,
+            Keyword::PASSWORD,
+            Keyword::SECONDARY_ENGINE_ATTRIBUTE,
+            Keyword::START,
+            Keyword::TABLESPACE,
+            Keyword::UNION,
+        ]) {
+            Some(keyword) => keyword,
+            None => return Ok(None),
+        };
+
+        //Optional equal sign
+        let _ = parser.consume_token(&Token::Eq);
+        let value = parser.next_token();
+        let span = value.span;
+
+        match (keyword, value.token) {
+            //Handled in common code
+            // ENGINE [=] engine_name
+            // AUTO_INCREMENT [=] value
+            // [DEFAULT] CHARACTER SET [=] charset_name
+            // [DEFAULT] COLLATE [=] collation_name
+
+            // KEY_BLOCK_SIZE [=] value
+            (Keyword::KEY_BLOCK_SIZE, Token::Number(n, l)) => 
Ok(Some(SqlOption::KeyValue {
+                key: Ident::new("KEY_BLOCK_SIZE"),
+                value: Expr::value(Value::Number(Parser::parse(n, 
value.span.start)?, l)),
+            })),
+
+            // ROW_FORMAT [=] {DEFAULT | DYNAMIC | FIXED | COMPRESSED | 
REDUNDANT | COMPACT}
+            (Keyword::ROW_FORMAT, Token::Word(w)) => 
Ok(Some(SqlOption::KeyValue {
+                key: Ident::new("ROW_FORMAT"),
+                value: Expr::Identifier(Ident::with_span(span, w.value)),
+            })),
+
+            // {DATA | INDEX} DIRECTORY [=] 'absolute path to directory'
+            (Keyword::DATA, Token::Word(w)) if w.keyword == Keyword::DIRECTORY 
=> {
+                let _ = parser.consume_token(&Token::Eq);
+
+                let next_token = parser.next_token();
+
+                if let Token::SingleQuotedString(s) = next_token.token {
+                    Ok(Some(SqlOption::KeyValue {
+                        key: Ident::new("DATA DIRECTORY"),
+                        value: Expr::value(Value::SingleQuotedString(s)),
+                    }))
+                } else {
+                    parser.expected("Token::SingleQuotedString", next_token)?
+                }
+            }
+            (Keyword::INDEX, Token::Word(w)) if w.keyword == 
Keyword::DIRECTORY => {
+                let _ = parser.consume_token(&Token::Eq);
+                let next_token = parser.next_token();
+
+                if let Token::SingleQuotedString(s) = next_token.token {
+                    Ok(Some(SqlOption::KeyValue {
+                        key: Ident::new("INDEX DIRECTORY"),
+                        value: Expr::value(Value::SingleQuotedString(s)),
+                    }))
+                } else {
+                    parser.expected("Token::SingleQuotedString", next_token)?
+                }
+            }
+
+            // PACK_KEYS [=] {0 | 1 | DEFAULT}
+            (Keyword::PACK_KEYS, Token::Number(n, l)) => 
Ok(Some(SqlOption::KeyValue {
+                key: Ident::new("PACK_KEYS"),
+                value: Expr::value(Value::Number(Parser::parse(n, 
value.span.start)?, l)),
+            })),
+
+            (Keyword::PACK_KEYS, Token::Word(s)) if s.value.to_uppercase() == 
"DEFAULT" => {
+                Ok(Some(SqlOption::KeyValue {
+                    key: Ident::new("PACK_KEYS"),
+                    value: Expr::value(Value::SingleQuotedString(s.value)),
+                }))
+            }
+
+            // STATS_AUTO_RECALC [=] {DEFAULT | 0 | 1}
+            (Keyword::STATS_AUTO_RECALC, Token::Number(n, l)) => 
Ok(Some(SqlOption::KeyValue {
+                key: Ident::new("STATS_AUTO_RECALC"),
+                value: Expr::value(Value::Number(Parser::parse(n, 
value.span.start)?, l)),
+            })),
+
+            (Keyword::STATS_AUTO_RECALC, Token::Word(s)) if 
s.value.to_uppercase() == "DEFAULT" => {
+                Ok(Some(SqlOption::KeyValue {
+                    key: Ident::new("STATS_AUTO_RECALC"),
+                    value: Expr::value(Value::SingleQuotedString(s.value)),
+                }))
+            }
+
+            //STATS_PERSISTENT [=] {DEFAULT | 0 | 1}
+            (Keyword::STATS_PERSISTENT, Token::Number(n, l)) => 
Ok(Some(SqlOption::KeyValue {
+                key: Ident::new("STATS_PERSISTENT"),
+                value: Expr::value(Value::Number(Parser::parse(n, 
value.span.start)?, l)),
+            })),
+
+            (Keyword::STATS_PERSISTENT, Token::Word(s)) if 
s.value.to_uppercase() == "DEFAULT" => {
+                Ok(Some(SqlOption::KeyValue {
+                    key: Ident::new("STATS_PERSISTENT"),
+                    value: Expr::value(Value::SingleQuotedString(s.value)),
+                }))
+            }
+
+            // STATS_SAMPLE_PAGES [=] value
+            (Keyword::STATS_SAMPLE_PAGES, Token::Number(n, l)) => 
Ok(Some(SqlOption::KeyValue {
+                key: Ident::new("STATS_SAMPLE_PAGES"),
+                value: Expr::value(Value::Number(Parser::parse(n, 
value.span.start)?, l)),
+            })),
+
+            // DELAY_KEY_WRITE [=] {0 | 1}
+            (Keyword::DELAY_KEY_WRITE, Token::Number(n, l)) => 
Ok(Some(SqlOption::KeyValue {
+                key: Ident::new("DELAY_KEY_WRITE"),
+                value: Expr::value(Value::Number(Parser::parse(n, 
value.span.start)?, l)),
+            })),
+
+            // COMPRESSION [=] {'ZLIB' | 'LZ4' | 'NONE'}
+            (Keyword::COMPRESSION, Token::SingleQuotedString(s))
+                if COMPRESSION_OPTS.contains(&s.to_uppercase().as_str()) =>

Review Comment:
   This is fine but honestly I would skip this check entirely for reasons in a 
similar vein to what I mentioned in my other comment about dialect overrides. 
Since they are all single quoted strings anyway, it's not really a *syntactic* 
question as to which compression algorithms are allowed, but a semantic one. 
What if I were writing a MySQL-compatible DBMS with added compression options? 
Are you really going to make me override and copy and paste this method when 
the syntax is identical but the semantic content has changed? It's also a 
maintainability hazard if MySQL (or any MySQL-compatible system that users are 
parsing with `MySqlDialect`) adds a new compression algorithm. I think I feel 
the same way about `INSERT_METHODS_OPTS` but it's a tad less clear since those 
are identifiers not strings.
   
   If you are gonna keep these checks, I would inline the arrays instead of 
defining them at the top of the function, to keep that info local to where it's 
used.



##########
src/dialect/mysql.rs:
##########
@@ -141,6 +149,280 @@ impl Dialect for MySqlDialect {
     fn supports_set_names(&self) -> bool {
         true
     }
+    /// Dialect-specific table/view option parser override
+    ///
+    /// This method is called to parse the next table/view option.
+    ///
+    /// If `None` is returned, falls back to the default behavior.
+    /// <https://dev.mysql.com/doc/refman/8.4/en/create-table.html>
+    fn parse_plain_option(&self, parser: &mut Parser) -> 
Result<Option<SqlOption>, ParserError> {
+        //Some consts
+        const COMPRESSION_OPTS: [&str; 3] = ["ZLIB", "LZ4", "NONE"];
+        const INSERT_METHODS_OPTS: [&str; 3] = ["NO", "FIRST", "LAST"];
+
+        let keyword = match parser.parse_one_of_keywords(&[

Review Comment:
   I philosophically agree with @iffyio's ask here, as I mention in my other 
big ranty comment (sorry 😛). However, I am not 100% convinced in practice, 
mostly because these options aren't comma separated and this could produce some 
really confusing parses if users make errors. I imagined the default parsing 
routine would be "parse ident, optionally parse equals sign, parse expr". But 
suppose a user intended to send `ROW_FORMAT COMPACT DEFAULT CHARACTER SET 
latin1`, but accidentally deleted `COMPACT`. Instead of a parse error like they 
would get if options were comma separated or hardcoded, they get a successful 
parse with `ROW_FORMAT = DEFAULT` followed by `CHARACTER SET = latin1`. Is it 
our fault? Not really. Is it better than the maintenance burden of keeping a 
list of valid one-word option names up to date? I'm honestly not sure.



##########
src/parser/mod.rs:
##########
@@ -6928,13 +6874,122 @@ impl<'a> Parser<'a> {
             };
         }
 
+        let plain_options = self.parse_plain_options()?;
+
         Ok(CreateTableConfiguration {
             partition_by,
             cluster_by,
             options,
+            plain_options,
         })
     }
 
+    fn parse_plain_option(&mut self) -> Result<Option<SqlOption>, ParserError> 
{
+        if let Some(option) = self.dialect.parse_plain_option(self)? {
+            return Ok(Some(option));
+        }
+
+        // Parse table options in any order
+        if let Some(keyword) = self.parse_one_of_keywords(&[
+            Keyword::ENGINE,
+            Keyword::AUTO_INCREMENT,
+            Keyword::DEFAULT,
+            Keyword::CHARSET,
+            Keyword::COLLATE,
+            Keyword::INSERT_METHOD,
+        ]) {

Review Comment:
   You're right, that's a bigger question. I don't have a strong suggestion at 
the moment and this certainly seems servicable to me. My 2¢ on the bigger 
question is that introducing more dialect overrides (and actually this is more 
similar to `parse_column_option`, which is global, than Snowflake's 
`parse_create_table` which is only called from 
`SnowflakeDialect::parse_statement`, the normal top-level override) is a 
slippery slope toward fully dynamically dispatched parsing, but it's more 
justifiable if it's parsing something with a substantially different syntactic 
structure. However, this is *mostly* just parsing a different list of 
acceptable keywords but doing it in the same way. In such cases, I personally 
think more permissive generic parsing is preferable; e.g. does it really matter 
if `SnowflakeDialect` accepts `ROW_FORMAT = COMPACT` even though it doesn't 
make sense, so long as it doesn't introduce syntactic ambiguity? The exceptions 
of special parsing routines for a few op
 tions like `DATA DIRECTORY` and `TABLESPACE` are a good argument though. If 
anything, I would probably favor putting all the not-so-special options in the 
default implementation and only using the override for these options that 
require special parsing. Honestly, if it wouldn't introduce ambiguity (mostly 
due to options not being comma separated), I would probably even prefer the 
default/fallback routine for non-special options just be "parse ident, 
optionally parse equals sign, parse expr"—that's all thats's going to end up in 
the `SqlOption::KeyValue` at the end anyway, so why do we care which specific 
list of idents are acceptable? That is more of a semantic than syntactic 
question IMO.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Re: [PR] Add all missing table options to be handled in any order [datafusion-sqlparser-rs]

Reply via email to