This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch gh-readonly-queue/main/pr-2170-d0a0b3e11b2b18804f0a9ef2c2055de929ccfcf3
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git

commit d52681969e43cf0d66ea9c4e09fc32beb8d17ad2
Author: Minjun Kim <[email protected]>
AuthorDate: Mon Feb 9 18:23:04 2026 +0900

    Databricks: Add support for `OPTIMIZE`, `PARTITIONED BY`, and `STRUCT` (#2170)
---
 src/ast/mod.rs                |  35 +++++++-
 src/dialect/databricks.rs     |   5 ++
 src/parser/mod.rs             |  63 ++++++++++++--
 tests/sqlparser_databricks.rs | 198 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 290 insertions(+), 11 deletions(-)
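
For a quick sense of what this commit enables, a minimal round-trip sketch
(hedged: the table, predicate, and column names are illustrative, and it
assumes a build of sqlparser-rs that includes this commit):

    use sqlparser::dialect::DatabricksDialect;
    use sqlparser::parser::Parser;

    fn main() {
        // Databricks-style OPTIMIZE: no TABLE keyword, optional WHERE
        // predicate, optional ZORDER BY column list.
        let sql = "OPTIMIZE my_table WHERE date = '2023-01-01' ZORDER BY (col1, col2)";
        let stmts = Parser::parse_sql(&DatabricksDialect {}, sql).expect("parse failed");
        // The Display changes below serialize the AST back to the same text.
        assert_eq!(stmts[0].to_string(), sql);
    }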

diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 010a8189..cc34ec7a 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -4578,22 +4578,40 @@ pub enum Statement {
         /// Legacy copy-style options.
         options: Vec<CopyLegacyOption>,
     },
+    /// ClickHouse:
     /// ```sql
    /// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
     /// ```
-    ///
    /// See ClickHouse <https://clickhouse.com/docs/en/sql-reference/statements/optimize>
+    ///
+    /// Databricks:
+    /// ```sql
+    /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+    /// ```
+    /// See Databricks <https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
     OptimizeTable {
         /// Table name to optimize.
         name: ObjectName,
+        /// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE TABLE`, Databricks uses `OPTIMIZE`).
+        has_table_keyword: bool,
         /// Optional cluster identifier.
+        /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
         on_cluster: Option<Ident>,
         /// Optional partition spec.
+        /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
         partition: Option<Partition>,
         /// Whether `FINAL` was specified.
+        /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
         include_final: bool,
         /// Optional deduplication settings.
+        /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
         deduplicate: Option<Deduplicate>,
+        /// Optional WHERE predicate.
+        /// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
+        predicate: Option<Expr>,
+        /// Optional ZORDER BY columns.
+        /// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
+        zorder: Option<Vec<Expr>>,
     },
     /// ```sql
     /// LISTEN
@@ -6069,12 +6087,19 @@ impl fmt::Display for Statement {
             }
             Statement::OptimizeTable {
                 name,
+                has_table_keyword,
                 on_cluster,
                 partition,
                 include_final,
                 deduplicate,
+                predicate,
+                zorder,
             } => {
-                write!(f, "OPTIMIZE TABLE {name}")?;
+                write!(f, "OPTIMIZE")?;
+                if *has_table_keyword {
+                    write!(f, " TABLE")?;
+                }
+                write!(f, " {name}")?;
                 if let Some(on_cluster) = on_cluster {
                     write!(f, " ON CLUSTER {on_cluster}")?;
                 }
@@ -6087,6 +6112,12 @@ impl fmt::Display for Statement {
                 if let Some(deduplicate) = deduplicate {
                     write!(f, " {deduplicate}")?;
                 }
+                if let Some(predicate) = predicate {
+                    write!(f, " WHERE {predicate}")?;
+                }
+                if let Some(zorder) = zorder {
+                    write!(f, " ZORDER BY ({})", 
display_comma_separated(zorder))?;
+                }
                 Ok(())
             }
             Statement::LISTEN { channel } => {
diff --git a/src/dialect/databricks.rs b/src/dialect/databricks.rs
index e903b073..55e4f56c 100644
--- a/src/dialect/databricks.rs
+++ b/src/dialect/databricks.rs
@@ -85,4 +85,9 @@ impl Dialect for DatabricksDialect {
     fn supports_values_as_table_factor(&self) -> bool {
         true
     }
+
+    /// See <https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
+    fn supports_optimize_table(&self) -> bool {
+        true
+    }
 }
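
The hook above is the only opt-in a dialect needs. As a hedged sketch, a
hypothetical custom dialect could enable the same parsing (`MyDialect` is
made up; the two identifier methods are the trait's required ones, kept
deliberately simple):

    use sqlparser::dialect::Dialect;

    #[derive(Debug)]
    struct MyDialect;

    impl Dialect for MyDialect {
        fn is_identifier_start(&self, ch: char) -> bool {
            ch.is_ascii_alphabetic() || ch == '_'
        }
        fn is_identifier_part(&self, ch: char) -> bool {
            ch.is_ascii_alphanumeric() || ch == '_'
        }
        // Opt in to OPTIMIZE [TABLE] parsing, like DatabricksDialect above.
        fn supports_optimize_table(&self) -> bool {
            true
        }
    }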
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 0b152f2b..c40ed427 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -693,7 +693,6 @@ impl<'a> Parser<'a> {
                // `INSTALL` is duckdb specific https://duckdb.org/docs/extensions/overview
                Keyword::INSTALL if self.dialect.supports_install() => self.parse_install(),
                 Keyword::LOAD => self.parse_load(),
-                // `OPTIMIZE` is clickhouse specific https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
                Keyword::OPTIMIZE if self.dialect.supports_optimize_table() => {
                     self.parse_optimize_table()
                 }
@@ -3382,24 +3381,28 @@ impl<'a> Parser<'a> {
     ///
     /// ```sql
     /// [field_name] field_type
+    /// field_name: field_type
     /// ```
     ///
    /// [struct]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type
     /// [tuple]: https://clickhouse.com/docs/en/sql-reference/data-types/tuple
+    /// [databricks]: https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
     fn parse_struct_field_def(
         &mut self,
     ) -> Result<(StructField, MatchedTrailingBracket), ParserError> {
         // Look beyond the next item to infer whether both field name
         // and type are specified.
-        let is_anonymous_field = !matches!(
+        let is_named_field = matches!(
             (self.peek_nth_token(0).token, self.peek_nth_token(1).token),
-            (Token::Word(_), Token::Word(_))
+            (Token::Word(_), Token::Word(_)) | (Token::Word(_), Token::Colon)
         );
 
-        let field_name = if is_anonymous_field {
-            None
+        let field_name = if is_named_field {
+            let name = self.parse_identifier()?;
+            let _ = self.consume_token(&Token::Colon);
+            Some(name)
         } else {
-            Some(self.parse_identifier()?)
+            None
         };
 
         let (field_type, trailing_bracket) = self.parse_data_type_helper()?;
@@ -7985,7 +7988,8 @@ impl<'a> Parser<'a> {
     pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
         if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
             self.expect_token(&Token::LParen)?;
-            let columns = self.parse_comma_separated(Parser::parse_column_def)?;
+            let columns =
+                self.parse_comma_separated(|parser| parser.parse_column_def_inner(true))?;
             self.expect_token(&Token::RParen)?;
             Ok(HiveDistributionStyle::PARTITIONED { columns })
         } else {
@@ -8809,9 +8813,19 @@ impl<'a> Parser<'a> {
 
     /// Parse column definition.
     pub fn parse_column_def(&mut self) -> Result<ColumnDef, ParserError> {
+        self.parse_column_def_inner(false)
+    }
+
+    fn parse_column_def_inner(
+        &mut self,
+        optional_data_type: bool,
+    ) -> Result<ColumnDef, ParserError> {
         let col_name = self.parse_identifier()?;
         let data_type = if self.is_column_type_sqlite_unspecified() {
             DataType::Unspecified
+        } else if optional_data_type {
+            self.maybe_parse(|parser| parser.parse_data_type())?
+                .unwrap_or(DataType::Unspecified)
         } else {
             self.parse_data_type()?
         };
@@ -11917,7 +11931,8 @@ impl<'a> Parser<'a> {
                     let field_defs = self.parse_duckdb_struct_type_def()?;
                    Ok(DataType::Struct(field_defs, StructBracketKind::Parentheses))
                 }
-                Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | GenericDialect) => {
+                Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | DatabricksDialect | GenericDialect) =>
+                {
                     self.prev_token();
                    let (field_defs, _trailing_bracket) =
                        self.parse_struct_type_def(Self::parse_struct_field_def)?;
@@ -18480,13 +18495,23 @@ impl<'a> Parser<'a> {
         }
     }
 
+    /// ClickHouse:
     /// ```sql
    /// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
     /// ```
    /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
+    ///
+    /// Databricks:
+    /// ```sql
+    /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+    /// ```
+    /// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
     pub fn parse_optimize_table(&mut self) -> Result<Statement, ParserError> {
-        self.expect_keyword_is(Keyword::TABLE)?;
+        let has_table_keyword = self.parse_keyword(Keyword::TABLE);
+
         let name = self.parse_object_name(false)?;
+
+        // ClickHouse-specific options
         let on_cluster = self.parse_optional_on_cluster()?;
 
         let partition = if self.parse_keyword(Keyword::PARTITION) {
@@ -18500,6 +18525,7 @@ impl<'a> Parser<'a> {
         };
 
         let include_final = self.parse_keyword(Keyword::FINAL);
+
         let deduplicate = if self.parse_keyword(Keyword::DEDUPLICATE) {
             if self.parse_keyword(Keyword::BY) {
                 Some(Deduplicate::ByExpression(self.parse_expr()?))
@@ -18510,12 +18536,31 @@ impl<'a> Parser<'a> {
             None
         };
 
+        // Databricks-specific options
+        let predicate = if self.parse_keyword(Keyword::WHERE) {
+            Some(self.parse_expr()?)
+        } else {
+            None
+        };
+
+        let zorder = if self.parse_keywords(&[Keyword::ZORDER, Keyword::BY]) {
+            self.expect_token(&Token::LParen)?;
+            let columns = self.parse_comma_separated(|p| p.parse_expr())?;
+            self.expect_token(&Token::RParen)?;
+            Some(columns)
+        } else {
+            None
+        };
+
         Ok(Statement::OptimizeTable {
             name,
+            has_table_keyword,
             on_cluster,
             partition,
             include_final,
             deduplicate,
+            predicate,
+            zorder,
         })
     }
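
To illustrate the struct-field change in this file, a hedged sketch (the
table and field names are made up; assumes the crate built from this
branch). The colon form parses and serializes to the canonical
space-separated form, mirroring the tests below:

    use sqlparser::dialect::DatabricksDialect;
    use sqlparser::parser::Parser;

    fn main() {
        // Colon-separated struct fields parse; the AST prints the
        // canonical space-separated form.
        let sql = "CREATE TABLE t (col1 STRUCT<field1: STRING, field2: INT>)";
        let stmts = Parser::parse_sql(&DatabricksDialect {}, sql).expect("parse failed");
        assert_eq!(
            stmts[0].to_string(),
            "CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)"
        );
    }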
 
diff --git a/tests/sqlparser_databricks.rs b/tests/sqlparser_databricks.rs
index b088afd7..6a7534ad 100644
--- a/tests/sqlparser_databricks.rs
+++ b/tests/sqlparser_databricks.rs
@@ -390,3 +390,201 @@ fn parse_table_time_travel() {
         .parse_sql_statements("SELECT 1 FROM t1 VERSION AS OF 1 - 2",)
         .is_err())
 }
+
+#[test]
+fn parse_optimize_table() {
+    // Basic OPTIMIZE (Databricks style - no TABLE keyword)
+    databricks().verified_stmt("OPTIMIZE my_table");
+    databricks().verified_stmt("OPTIMIZE db.my_table");
+    databricks().verified_stmt("OPTIMIZE catalog.db.my_table");
+
+    // With WHERE clause
+    databricks().verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01'");
+    databricks()
+        .verified_stmt("OPTIMIZE my_table WHERE date >= '2023-01-01' AND date < '2023-02-01'");
+
+    // With ZORDER BY clause
+    databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1)");
+    databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1, col2)");
+    databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1, col2, 
col3)");
+
+    // Combined WHERE and ZORDER BY
+    databricks().verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01' 
ZORDER BY (col1)");
+    databricks()
+        .verified_stmt("OPTIMIZE my_table WHERE date >= '2023-01-01' ZORDER BY (col1, col2)");
+
+    // Verify AST structure
+    match databricks()
+        .verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01' ZORDER BY (col1, col2)")
+    {
+        Statement::OptimizeTable {
+            name,
+            has_table_keyword,
+            on_cluster,
+            partition,
+            include_final,
+            deduplicate,
+            predicate,
+            zorder,
+        } => {
+            assert_eq!(name.to_string(), "my_table");
+            assert!(!has_table_keyword);
+            assert!(on_cluster.is_none());
+            assert!(partition.is_none());
+            assert!(!include_final);
+            assert!(deduplicate.is_none());
+            assert!(predicate.is_some());
+            assert_eq!(
+                zorder,
+                Some(vec![
+                    Expr::Identifier(Ident::new("col1")),
+                    Expr::Identifier(Ident::new("col2")),
+                ])
+            );
+        }
+        _ => unreachable!(),
+    }
+
+    // Negative cases
+    assert_eq!(
+        databricks()
+            .parse_sql_statements("OPTIMIZE my_table ZORDER BY")
+            .unwrap_err(),
+        ParserError::ParserError("Expected: (, found: EOF".to_string())
+    );
+    assert_eq!(
+        databricks()
+            .parse_sql_statements("OPTIMIZE my_table ZORDER BY ()")
+            .unwrap_err(),
+        ParserError::ParserError("Expected: an expression, found: 
)".to_string())
+    );
+}
+
+#[test]
+fn parse_create_table_partitioned_by() {
+    // Databricks allows PARTITIONED BY with just column names (referencing existing columns)
+    // https://docs.databricks.com/en/sql/language-manual/sql-ref-partition.html
+
+    // Single partition column without type
+    databricks().verified_stmt("CREATE TABLE t (col1 STRING, col2 INT) 
PARTITIONED BY (col1)");
+
+    // Multiple partition columns without types
+    databricks().verified_stmt(
+        "CREATE TABLE t (col1 STRING, col2 INT, col3 DATE) PARTITIONED BY 
(col1, col2)",
+    );
+
+    // Partition columns with types (new columns not in table spec)
+    databricks().verified_stmt("CREATE TABLE t (name STRING) PARTITIONED BY 
(year INT, month INT)");
+
+    // Mixed: some with types, some without
+    databricks()
+        .verified_stmt("CREATE TABLE t (id INT, name STRING) PARTITIONED BY 
(region, year INT)");
+
+    // Verify AST structure for column without type
+    match databricks().verified_stmt("CREATE TABLE t (col1 STRING) PARTITIONED 
BY (col1)") {
+        Statement::CreateTable(CreateTable {
+            name,
+            columns,
+            hive_distribution,
+            ..
+        }) => {
+            assert_eq!(name.to_string(), "t");
+            assert_eq!(columns.len(), 1);
+            assert_eq!(columns[0].name.to_string(), "col1");
+            match hive_distribution {
+                HiveDistributionStyle::PARTITIONED {
+                    columns: partition_cols,
+                } => {
+                    assert_eq!(partition_cols.len(), 1);
+                    assert_eq!(partition_cols[0].name.to_string(), "col1");
+                    assert_eq!(partition_cols[0].data_type, DataType::Unspecified);
+                }
+                _ => unreachable!(),
+            }
+        }
+        _ => unreachable!(),
+    }
+
+    // Verify AST structure for column with type
+    match databricks().verified_stmt("CREATE TABLE t (name STRING) PARTITIONED 
BY (year INT)") {
+        Statement::CreateTable(CreateTable {
+            hive_distribution:
+                HiveDistributionStyle::PARTITIONED {
+                    columns: partition_cols,
+                },
+            ..
+        }) => {
+            assert_eq!(partition_cols.len(), 1);
+            assert_eq!(partition_cols[0].name.to_string(), "year");
+            assert_eq!(partition_cols[0].data_type, DataType::Int(None));
+        }
+        _ => unreachable!(),
+    }
+}
+
+#[test]
+fn parse_databricks_struct_type() {
+    // Databricks uses colon-separated struct field syntax (colon is optional)
+    // https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
+
+    // Basic struct with colon syntax - parses to canonical form without colons
+    databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 STRUCT<field1: STRING, field2: INT>)",
+        "CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)",
+    );
+
+    // Nested array of struct (the original issue case)
+    databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 ARRAY<STRUCT<finish_flag: STRING, survive_flag: 
STRING, score: INT>>)",
+        "CREATE TABLE t (col1 ARRAY<STRUCT<finish_flag STRING, survive_flag 
STRING, score INT>>)",
+    );
+
+    // Multiple struct columns
+    databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 STRUCT<a: INT, b: STRING>, col2 STRUCT<x: 
DOUBLE>)",
+        "CREATE TABLE t (col1 STRUCT<a INT, b STRING>, col2 STRUCT<x DOUBLE>)",
+    );
+
+    // Deeply nested structs
+    databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 STRUCT<outer: STRUCT<inner: STRING>>)",
+        "CREATE TABLE t (col1 STRUCT<outer STRUCT<inner STRING>>)",
+    );
+
+    // Struct with array field
+    databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 STRUCT<items: ARRAY<INT>, name: STRING>)",
+        "CREATE TABLE t (col1 STRUCT<items ARRAY<INT>, name STRING>)",
+    );
+
+    // Syntax without colons should also work (BigQuery compatible)
+    databricks().verified_stmt("CREATE TABLE t (col1 STRUCT<field1 STRING, 
field2 INT>)");
+
+    // Verify AST structure
+    match databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 STRUCT<field1: STRING, field2: INT>)",
+        "CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)",
+    ) {
+        Statement::CreateTable(CreateTable { columns, .. }) => {
+            assert_eq!(columns.len(), 1);
+            assert_eq!(columns[0].name.to_string(), "col1");
+            match &columns[0].data_type {
+                DataType::Struct(fields, StructBracketKind::AngleBrackets) => {
+                    assert_eq!(fields.len(), 2);
+                    assert_eq!(
+                        fields[0].field_name.as_ref().map(|i| i.to_string()),
+                        Some("field1".to_string())
+                    );
+                    assert_eq!(fields[0].field_type, DataType::String(None));
+                    assert_eq!(
+                        fields[1].field_name.as_ref().map(|i| i.to_string()),
+                        Some("field2".to_string())
+                    );
+                    assert_eq!(fields[1].field_type, DataType::Int(None));
+                }
+                _ => unreachable!(),
+            }
+        }
+        _ => unreachable!(),
+    }
+}
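
Finally, a hedged sketch of the PARTITIONED BY behavior exercised above
(hypothetical table; assumes the crate built from this branch). A type-less
partition column parses as DataType::Unspecified and round-trips unchanged:

    use sqlparser::dialect::DatabricksDialect;
    use sqlparser::parser::Parser;

    fn main() {
        // `col1` references an existing column, so no data type is given;
        // the parser records DataType::Unspecified and prints it back as-is.
        let sql = "CREATE TABLE t (col1 STRING, col2 INT) PARTITIONED BY (col1)";
        let stmts = Parser::parse_sql(&DatabricksDialect {}, sql).expect("parse failed");
        assert_eq!(stmts[0].to_string(), sql);
    }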

