This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d5268196 Databricks: Add support for `OPTIMIZE`, `PARTITIONED BY`, and `STRUCT` (#2170)
d5268196 is described below
commit d52681969e43cf0d66ea9c4e09fc32beb8d17ad2
Author: Minjun Kim <[email protected]>
AuthorDate: Mon Feb 9 18:23:04 2026 +0900
Databricks: Add support for `OPTIMIZE`, `PARTITIONED BY`, and `STRUCT` (#2170)
---
src/ast/mod.rs | 35 +++++++-
src/dialect/databricks.rs | 5 ++
src/parser/mod.rs | 63 ++++++++++++--
tests/sqlparser_databricks.rs | 198 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 290 insertions(+), 11 deletions(-)
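
For context, a minimal sketch of how the new Databricks syntax can be exercised through the crate's public entry point (the `Parser::parse_sql` API and `DatabricksDialect` both predate this change; the SQL string mirrors the new tests below):

```rust
use sqlparser::dialect::DatabricksDialect;
use sqlparser::parser::{Parser, ParserError};

fn main() -> Result<(), ParserError> {
    // Databricks-style OPTIMIZE: no TABLE keyword, with WHERE and ZORDER BY.
    let sql = "OPTIMIZE my_table WHERE date = '2023-01-01' ZORDER BY (col1, col2)";
    let statements = Parser::parse_sql(&DatabricksDialect {}, sql)?;

    // The Display impl round-trips the statement to its canonical text.
    assert_eq!(statements[0].to_string(), sql);
    Ok(())
}
```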
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 010a8189..cc34ec7a 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -4578,22 +4578,40 @@ pub enum Statement {
/// Legacy copy-style options.
options: Vec<CopyLegacyOption>,
},
+ /// ClickHouse:
/// ```sql
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
/// ```
- ///
/// See ClickHouse <https://clickhouse.com/docs/en/sql-reference/statements/optimize>
+ ///
+ /// Databricks:
+ /// ```sql
+ /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+ /// ```
+ /// See Databricks <https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
OptimizeTable {
/// Table name to optimize.
name: ObjectName,
+ /// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE TABLE`, Databricks uses `OPTIMIZE`).
+ has_table_keyword: bool,
/// Optional cluster identifier.
+ ///
+ /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
on_cluster: Option<Ident>,
/// Optional partition spec.
+ ///
+ /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
partition: Option<Partition>,
/// Whether `FINAL` was specified.
+ ///
+ /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
include_final: bool,
/// Optional deduplication settings.
+ ///
+ /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
deduplicate: Option<Deduplicate>,
+ /// Optional WHERE predicate.
+ ///
+ /// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
+ predicate: Option<Expr>,
+ /// Optional ZORDER BY columns.
+ ///
+ /// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
+ zorder: Option<Vec<Expr>>,
},
/// ```sql
/// LISTEN
@@ -6069,12 +6087,19 @@ impl fmt::Display for Statement {
}
Statement::OptimizeTable {
name,
+ has_table_keyword,
on_cluster,
partition,
include_final,
deduplicate,
+ predicate,
+ zorder,
} => {
- write!(f, "OPTIMIZE TABLE {name}")?;
+ write!(f, "OPTIMIZE")?;
+ if *has_table_keyword {
+ write!(f, " TABLE")?;
+ }
+ write!(f, " {name}")?;
if let Some(on_cluster) = on_cluster {
write!(f, " ON CLUSTER {on_cluster}")?;
}
@@ -6087,6 +6112,12 @@ impl fmt::Display for Statement {
if let Some(deduplicate) = deduplicate {
write!(f, " {deduplicate}")?;
}
+ if let Some(predicate) = predicate {
+ write!(f, " WHERE {predicate}")?;
+ }
+ if let Some(zorder) = zorder {
+ write!(f, " ZORDER BY ({})",
display_comma_separated(zorder))?;
+ }
Ok(())
}
Statement::LISTEN { channel } => {
diff --git a/src/dialect/databricks.rs b/src/dialect/databricks.rs
index e903b073..55e4f56c 100644
--- a/src/dialect/databricks.rs
+++ b/src/dialect/databricks.rs
@@ -85,4 +85,9 @@ impl Dialect for DatabricksDialect {
fn supports_values_as_table_factor(&self) -> bool {
true
}
+
+ /// See <https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
+ fn supports_optimize_table(&self) -> bool {
+ true
+ }
}
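
Since the statement stays gated behind `Dialect::supports_optimize_table`, any other dialect can opt in the same way. A hedged sketch with a hypothetical dialect (the name and identifier rules below are illustrative, not part of this patch; `is_identifier_start`/`is_identifier_part` are the trait's required methods):

```rust
use sqlparser::dialect::Dialect;

// Hypothetical dialect, shown only to illustrate the opt-in hook.
#[derive(Debug)]
struct MyLakehouseDialect;

impl Dialect for MyLakehouseDialect {
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_ascii_alphabetic() || ch == '_'
    }

    fn is_identifier_part(&self, ch: char) -> bool {
        ch.is_ascii_alphanumeric() || ch == '_'
    }

    // Same hook that DatabricksDialect enables above.
    fn supports_optimize_table(&self) -> bool {
        true
    }
}
```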
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 0b152f2b..c40ed427 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -693,7 +693,6 @@ impl<'a> Parser<'a> {
// `INSTALL` is duckdb specific https://duckdb.org/docs/extensions/overview
Keyword::INSTALL if self.dialect.supports_install() => self.parse_install(),
Keyword::LOAD => self.parse_load(),
- // `OPTIMIZE` is clickhouse specific https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
Keyword::OPTIMIZE if self.dialect.supports_optimize_table() => {
self.parse_optimize_table()
}
@@ -3382,24 +3381,28 @@ impl<'a> Parser<'a> {
///
/// ```sql
/// [field_name] field_type
+ /// field_name: field_type
/// ```
///
/// [struct]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type
/// [tuple]: https://clickhouse.com/docs/en/sql-reference/data-types/tuple
+ /// [databricks]: https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
fn parse_struct_field_def(
&mut self,
) -> Result<(StructField, MatchedTrailingBracket), ParserError> {
// Look beyond the next item to infer whether both field name
// and type are specified.
- let is_anonymous_field = !matches!(
+ let is_named_field = matches!(
(self.peek_nth_token(0).token, self.peek_nth_token(1).token),
- (Token::Word(_), Token::Word(_))
+ (Token::Word(_), Token::Word(_)) | (Token::Word(_), Token::Colon)
);
- let field_name = if is_anonymous_field {
- None
+ let field_name = if is_named_field {
+ let name = self.parse_identifier()?;
+ let _ = self.consume_token(&Token::Colon);
+ Some(name)
} else {
- Some(self.parse_identifier()?)
+ None
};
let (field_type, trailing_bracket) = self.parse_data_type_helper()?;
@@ -7985,7 +7988,8 @@ impl<'a> Parser<'a> {
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
- let columns = self.parse_comma_separated(Parser::parse_column_def)?;
+ let columns =
+ self.parse_comma_separated(|parser| parser.parse_column_def_inner(true))?;
self.expect_token(&Token::RParen)?;
Ok(HiveDistributionStyle::PARTITIONED { columns })
} else {
@@ -8809,9 +8813,19 @@ impl<'a> Parser<'a> {
/// Parse column definition.
pub fn parse_column_def(&mut self) -> Result<ColumnDef, ParserError> {
+ self.parse_column_def_inner(false)
+ }
+
+ fn parse_column_def_inner(
+ &mut self,
+ optional_data_type: bool,
+ ) -> Result<ColumnDef, ParserError> {
let col_name = self.parse_identifier()?;
let data_type = if self.is_column_type_sqlite_unspecified() {
DataType::Unspecified
+ } else if optional_data_type {
+ self.maybe_parse(|parser| parser.parse_data_type())?
+ .unwrap_or(DataType::Unspecified)
} else {
self.parse_data_type()?
};
@@ -11917,7 +11931,8 @@ impl<'a> Parser<'a> {
let field_defs = self.parse_duckdb_struct_type_def()?;
Ok(DataType::Struct(field_defs, StructBracketKind::Parentheses))
}
- Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | GenericDialect) => {
+ Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | DatabricksDialect | GenericDialect) =>
+ {
self.prev_token();
let (field_defs, _trailing_bracket) =
self.parse_struct_type_def(Self::parse_struct_field_def)?;
@@ -18480,13 +18495,23 @@ impl<'a> Parser<'a> {
}
}
+ /// ClickHouse:
/// ```sql
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
/// ```
///
/// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
+ ///
+ /// Databricks:
+ /// ```sql
+ /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+ /// ```
+ ///
+ /// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
pub fn parse_optimize_table(&mut self) -> Result<Statement, ParserError> {
- self.expect_keyword_is(Keyword::TABLE)?;
+ let has_table_keyword = self.parse_keyword(Keyword::TABLE);
+
let name = self.parse_object_name(false)?;
+
+ // ClickHouse-specific options
let on_cluster = self.parse_optional_on_cluster()?;
let partition = if self.parse_keyword(Keyword::PARTITION) {
@@ -18500,6 +18525,7 @@ impl<'a> Parser<'a> {
};
let include_final = self.parse_keyword(Keyword::FINAL);
+
let deduplicate = if self.parse_keyword(Keyword::DEDUPLICATE) {
if self.parse_keyword(Keyword::BY) {
Some(Deduplicate::ByExpression(self.parse_expr()?))
@@ -18510,12 +18536,31 @@ impl<'a> Parser<'a> {
None
};
+ // Databricks-specific options
+ let predicate = if self.parse_keyword(Keyword::WHERE) {
+ Some(self.parse_expr()?)
+ } else {
+ None
+ };
+
+ let zorder = if self.parse_keywords(&[Keyword::ZORDER, Keyword::BY]) {
+ self.expect_token(&Token::LParen)?;
+ let columns = self.parse_comma_separated(|p| p.parse_expr())?;
+ self.expect_token(&Token::RParen)?;
+ Some(columns)
+ } else {
+ None
+ };
+
Ok(Statement::OptimizeTable {
name,
+ has_table_keyword,
on_cluster,
partition,
include_final,
deduplicate,
+ predicate,
+ zorder,
})
}
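
Note that the colon in the Databricks STRUCT field syntax is consumed but not stored, so serializing the parsed statement yields the space-separated canonical form. A minimal sketch of that round trip (the SQL strings are taken from the tests below):

```rust
use sqlparser::dialect::DatabricksDialect;
use sqlparser::parser::{Parser, ParserError};

fn main() -> Result<(), ParserError> {
    // The optional colon is accepted on input...
    let sql = "CREATE TABLE t (col1 STRUCT<field1: STRING, field2: INT>)";
    let statements = Parser::parse_sql(&DatabricksDialect {}, sql)?;

    // ...and dropped on output, giving the BigQuery-compatible form.
    assert_eq!(
        statements[0].to_string(),
        "CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)"
    );
    Ok(())
}
```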
diff --git a/tests/sqlparser_databricks.rs b/tests/sqlparser_databricks.rs
index b088afd7..6a7534ad 100644
--- a/tests/sqlparser_databricks.rs
+++ b/tests/sqlparser_databricks.rs
@@ -390,3 +390,201 @@ fn parse_table_time_travel() {
.parse_sql_statements("SELECT 1 FROM t1 VERSION AS OF 1 - 2",)
.is_err())
}
+
+#[test]
+fn parse_optimize_table() {
+ // Basic OPTIMIZE (Databricks style - no TABLE keyword)
+ databricks().verified_stmt("OPTIMIZE my_table");
+ databricks().verified_stmt("OPTIMIZE db.my_table");
+ databricks().verified_stmt("OPTIMIZE catalog.db.my_table");
+
+ // With WHERE clause
+ databricks().verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01'");
+ databricks()
+ .verified_stmt("OPTIMIZE my_table WHERE date >= '2023-01-01' AND date
< '2023-02-01'");
+
+ // With ZORDER BY clause
+ databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1)");
+ databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1, col2)");
+ databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1, col2,
col3)");
+
+ // Combined WHERE and ZORDER BY
+ databricks().verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01'
ZORDER BY (col1)");
+ databricks()
+ .verified_stmt("OPTIMIZE my_table WHERE date >= '2023-01-01' ZORDER BY
(col1, col2)");
+
+ // Verify AST structure
+ match databricks()
+ .verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01' ZORDER BY
(col1, col2)")
+ {
+ Statement::OptimizeTable {
+ name,
+ has_table_keyword,
+ on_cluster,
+ partition,
+ include_final,
+ deduplicate,
+ predicate,
+ zorder,
+ } => {
+ assert_eq!(name.to_string(), "my_table");
+ assert!(!has_table_keyword);
+ assert!(on_cluster.is_none());
+ assert!(partition.is_none());
+ assert!(!include_final);
+ assert!(deduplicate.is_none());
+ assert!(predicate.is_some());
+ assert_eq!(
+ zorder,
+ Some(vec![
+ Expr::Identifier(Ident::new("col1")),
+ Expr::Identifier(Ident::new("col2")),
+ ])
+ );
+ }
+ _ => unreachable!(),
+ }
+
+ // Negative cases
+ assert_eq!(
+ databricks()
+ .parse_sql_statements("OPTIMIZE my_table ZORDER BY")
+ .unwrap_err(),
+ ParserError::ParserError("Expected: (, found: EOF".to_string())
+ );
+ assert_eq!(
+ databricks()
+ .parse_sql_statements("OPTIMIZE my_table ZORDER BY ()")
+ .unwrap_err(),
+ ParserError::ParserError("Expected: an expression, found:
)".to_string())
+ );
+}
+
+#[test]
+fn parse_create_table_partitioned_by() {
+ // Databricks allows PARTITIONED BY with just column names (referencing existing columns)
+ // https://docs.databricks.com/en/sql/language-manual/sql-ref-partition.html
+
+ // Single partition column without type
+ databricks().verified_stmt("CREATE TABLE t (col1 STRING, col2 INT)
PARTITIONED BY (col1)");
+
+ // Multiple partition columns without types
+ databricks().verified_stmt(
+ "CREATE TABLE t (col1 STRING, col2 INT, col3 DATE) PARTITIONED BY
(col1, col2)",
+ );
+
+ // Partition columns with types (new columns not in table spec)
+ databricks().verified_stmt("CREATE TABLE t (name STRING) PARTITIONED BY
(year INT, month INT)");
+
+ // Mixed: some with types, some without
+ databricks()
+ .verified_stmt("CREATE TABLE t (id INT, name STRING) PARTITIONED BY
(region, year INT)");
+
+ // Verify AST structure for column without type
+ match databricks().verified_stmt("CREATE TABLE t (col1 STRING) PARTITIONED
BY (col1)") {
+ Statement::CreateTable(CreateTable {
+ name,
+ columns,
+ hive_distribution,
+ ..
+ }) => {
+ assert_eq!(name.to_string(), "t");
+ assert_eq!(columns.len(), 1);
+ assert_eq!(columns[0].name.to_string(), "col1");
+ match hive_distribution {
+ HiveDistributionStyle::PARTITIONED {
+ columns: partition_cols,
+ } => {
+ assert_eq!(partition_cols.len(), 1);
+ assert_eq!(partition_cols[0].name.to_string(), "col1");
+ assert_eq!(partition_cols[0].data_type, DataType::Unspecified);
+ }
+ _ => unreachable!(),
+ }
+ }
+ _ => unreachable!(),
+ }
+
+ // Verify AST structure for column with type
+ match databricks().verified_stmt("CREATE TABLE t (name STRING) PARTITIONED
BY (year INT)") {
+ Statement::CreateTable(CreateTable {
+ hive_distribution:
+ HiveDistributionStyle::PARTITIONED {
+ columns: partition_cols,
+ },
+ ..
+ }) => {
+ assert_eq!(partition_cols.len(), 1);
+ assert_eq!(partition_cols[0].name.to_string(), "year");
+ assert_eq!(partition_cols[0].data_type, DataType::Int(None));
+ }
+ _ => unreachable!(),
+ }
+}
+
+#[test]
+fn parse_databricks_struct_type() {
+ // Databricks uses colon-separated struct field syntax (colon is optional)
+ // https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
+
+ // Basic struct with colon syntax - parses to canonical form without colons
+ databricks().one_statement_parses_to(
+ "CREATE TABLE t (col1 STRUCT<field1: STRING, field2: INT>)",
+ "CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)",
+ );
+
+ // Nested array of struct (the original issue case)
+ databricks().one_statement_parses_to(
+ "CREATE TABLE t (col1 ARRAY<STRUCT<finish_flag: STRING, survive_flag:
STRING, score: INT>>)",
+ "CREATE TABLE t (col1 ARRAY<STRUCT<finish_flag STRING, survive_flag
STRING, score INT>>)",
+ );
+
+ // Multiple struct columns
+ databricks().one_statement_parses_to(
+ "CREATE TABLE t (col1 STRUCT<a: INT, b: STRING>, col2 STRUCT<x:
DOUBLE>)",
+ "CREATE TABLE t (col1 STRUCT<a INT, b STRING>, col2 STRUCT<x DOUBLE>)",
+ );
+
+ // Deeply nested structs
+ databricks().one_statement_parses_to(
+ "CREATE TABLE t (col1 STRUCT<outer: STRUCT<inner: STRING>>)",
+ "CREATE TABLE t (col1 STRUCT<outer STRUCT<inner STRING>>)",
+ );
+
+ // Struct with array field
+ databricks().one_statement_parses_to(
+ "CREATE TABLE t (col1 STRUCT<items: ARRAY<INT>, name: STRING>)",
+ "CREATE TABLE t (col1 STRUCT<items ARRAY<INT>, name STRING>)",
+ );
+
+ // Syntax without colons should also work (BigQuery compatible)
+ databricks().verified_stmt("CREATE TABLE t (col1 STRUCT<field1 STRING,
field2 INT>)");
+
+ // Verify AST structure
+ match databricks().one_statement_parses_to(
+ "CREATE TABLE t (col1 STRUCT<field1: STRING, field2: INT>)",
+ "CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)",
+ ) {
+ Statement::CreateTable(CreateTable { columns, .. }) => {
+ assert_eq!(columns.len(), 1);
+ assert_eq!(columns[0].name.to_string(), "col1");
+ match &columns[0].data_type {
+ DataType::Struct(fields, StructBracketKind::AngleBrackets) => {
+ assert_eq!(fields.len(), 2);
+ assert_eq!(
+ fields[0].field_name.as_ref().map(|i| i.to_string()),
+ Some("field1".to_string())
+ );
+ assert_eq!(fields[0].field_type, DataType::String(None));
+ assert_eq!(
+ fields[1].field_name.as_ref().map(|i| i.to_string()),
+ Some("field2".to_string())
+ );
+ assert_eq!(fields[1].field_type, DataType::Int(None));
+ }
+ _ => unreachable!(),
+ }
+ }
+ _ => unreachable!(),
+ }
+}
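
As the PARTITIONED BY tests show, a partition column named without a type parses with `DataType::Unspecified`, which the display code then omits. A short sketch of inspecting that from outside the test harness (assuming the usual `sqlparser::ast` re-exports):

```rust
use sqlparser::ast::{DataType, HiveDistributionStyle, Statement};
use sqlparser::dialect::DatabricksDialect;
use sqlparser::parser::Parser;

fn main() {
    let sql = "CREATE TABLE t (col1 STRING) PARTITIONED BY (col1)";
    let statements = Parser::parse_sql(&DatabricksDialect {}, sql).unwrap();

    if let Statement::CreateTable(create) = &statements[0] {
        if let HiveDistributionStyle::PARTITIONED { columns } = &create.hive_distribution {
            // The untyped partition column carries no data type.
            assert_eq!(columns[0].data_type, DataType::Unspecified);
        }
    }
}
```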
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]