This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new b0925c801e Support alternate format for Utf8 unparsing (CHAR) (#11494)
b0925c801e is described below
commit b0925c801e1c07bd78c78b045ab58fbd0630b638
Author: Sergei Grebnov <[email protected]>
AuthorDate: Wed Jul 17 04:21:34 2024 -0700
Support alternate format for Utf8 unparsing (CHAR) (#11494)
* Add dialect param to use CHAR instead of TEXT for Utf8 unparsing for
MySQL (#12)
* Configurable data type instead of flag for Utf8 unparsing
* Fix type in comment
---
datafusion/sql/src/unparser/dialect.rs | 52 +++++++++++++++++++++++++++++++++-
datafusion/sql/src/unparser/expr.rs | 34 ++++++++++++++++++++--
2 files changed, 83 insertions(+), 3 deletions(-)
diff --git a/datafusion/sql/src/unparser/dialect.rs
b/datafusion/sql/src/unparser/dialect.rs
index eca2eb4fd0..87453f81ee 100644
--- a/datafusion/sql/src/unparser/dialect.rs
+++ b/datafusion/sql/src/unparser/dialect.rs
@@ -16,7 +16,7 @@
// under the License.
use regex::Regex;
-use sqlparser::keywords::ALL_KEYWORDS;
+use sqlparser::{ast, keywords::ALL_KEYWORDS};
/// `Dialect` to use for Unparsing
///
@@ -45,6 +45,17 @@ pub trait Dialect {
fn interval_style(&self) -> IntervalStyle {
IntervalStyle::PostgresVerbose
}
+
+ // The SQL type to use for Arrow Utf8 unparsing
+ // Most dialects use VARCHAR, but some, like MySQL, require CHAR
+ fn utf8_cast_dtype(&self) -> ast::DataType {
+ ast::DataType::Varchar(None)
+ }
+ // The SQL type to use for Arrow LargeUtf8 unparsing
+ // Most dialects use TEXT, but some, like MySQL, require CHAR
+ fn large_utf8_cast_dtype(&self) -> ast::DataType {
+ ast::DataType::Text
+ }
}
/// `IntervalStyle` to use for unparsing
@@ -103,6 +114,14 @@ impl Dialect for MySqlDialect {
fn interval_style(&self) -> IntervalStyle {
IntervalStyle::MySQL
}
+
+ fn utf8_cast_dtype(&self) -> ast::DataType {
+ ast::DataType::Char(None)
+ }
+
+ fn large_utf8_cast_dtype(&self) -> ast::DataType {
+ ast::DataType::Char(None)
+ }
}
pub struct SqliteDialect {}
@@ -118,6 +137,8 @@ pub struct CustomDialect {
supports_nulls_first_in_sort: bool,
use_timestamp_for_date64: bool,
interval_style: IntervalStyle,
+ utf8_cast_dtype: ast::DataType,
+ large_utf8_cast_dtype: ast::DataType,
}
impl Default for CustomDialect {
@@ -127,6 +148,8 @@ impl Default for CustomDialect {
supports_nulls_first_in_sort: true,
use_timestamp_for_date64: false,
interval_style: IntervalStyle::SQLStandard,
+ utf8_cast_dtype: ast::DataType::Varchar(None),
+ large_utf8_cast_dtype: ast::DataType::Text,
}
}
}
@@ -158,6 +181,14 @@ impl Dialect for CustomDialect {
fn interval_style(&self) -> IntervalStyle {
self.interval_style
}
+
+ fn utf8_cast_dtype(&self) -> ast::DataType {
+ self.utf8_cast_dtype.clone()
+ }
+
+ fn large_utf8_cast_dtype(&self) -> ast::DataType {
+ self.large_utf8_cast_dtype.clone()
+ }
}
/// `CustomDialectBuilder` to build `CustomDialect` using builder pattern
@@ -179,6 +210,8 @@ pub struct CustomDialectBuilder {
supports_nulls_first_in_sort: bool,
use_timestamp_for_date64: bool,
interval_style: IntervalStyle,
+ utf8_cast_dtype: ast::DataType,
+ large_utf8_cast_dtype: ast::DataType,
}
impl Default for CustomDialectBuilder {
@@ -194,6 +227,8 @@ impl CustomDialectBuilder {
supports_nulls_first_in_sort: true,
use_timestamp_for_date64: false,
interval_style: IntervalStyle::PostgresVerbose,
+ utf8_cast_dtype: ast::DataType::Varchar(None),
+ large_utf8_cast_dtype: ast::DataType::Text,
}
}
@@ -203,6 +238,8 @@ impl CustomDialectBuilder {
supports_nulls_first_in_sort: self.supports_nulls_first_in_sort,
use_timestamp_for_date64: self.use_timestamp_for_date64,
interval_style: self.interval_style,
+ utf8_cast_dtype: self.utf8_cast_dtype,
+ large_utf8_cast_dtype: self.large_utf8_cast_dtype,
}
}
@@ -235,4 +272,17 @@ impl CustomDialectBuilder {
self.interval_style = interval_style;
self
}
+
+ pub fn with_utf8_cast_dtype(mut self, utf8_cast_dtype: ast::DataType) ->
Self {
+ self.utf8_cast_dtype = utf8_cast_dtype;
+ self
+ }
+
+ pub fn with_large_utf8_cast_dtype(
+ mut self,
+ large_utf8_cast_dtype: ast::DataType,
+ ) -> Self {
+ self.large_utf8_cast_dtype = large_utf8_cast_dtype;
+ self
+ }
}
diff --git a/datafusion/sql/src/unparser/expr.rs
b/datafusion/sql/src/unparser/expr.rs
index e6b67b5d9f..950e7e1128 100644
--- a/datafusion/sql/src/unparser/expr.rs
+++ b/datafusion/sql/src/unparser/expr.rs
@@ -1275,8 +1275,8 @@ impl Unparser<'_> {
DataType::BinaryView => {
not_impl_err!("Unsupported DataType: conversion:
{data_type:?}")
}
- DataType::Utf8 => Ok(ast::DataType::Varchar(None)),
- DataType::LargeUtf8 => Ok(ast::DataType::Text),
+ DataType::Utf8 => Ok(self.dialect.utf8_cast_dtype()),
+ DataType::LargeUtf8 => Ok(self.dialect.large_utf8_cast_dtype()),
DataType::Utf8View => {
not_impl_err!("Unsupported DataType: conversion:
{data_type:?}")
}
@@ -1936,4 +1936,34 @@ mod tests {
assert_eq!(actual, expected);
}
}
+
+ #[test]
+ fn custom_dialect_use_char_for_utf8_cast() -> Result<()> {
+ let default_dialect = CustomDialectBuilder::default().build();
+ let mysql_custom_dialect = CustomDialectBuilder::new()
+ .with_utf8_cast_dtype(ast::DataType::Char(None))
+ .with_large_utf8_cast_dtype(ast::DataType::Char(None))
+ .build();
+
+ for (dialect, data_type, identifier) in [
+ (&default_dialect, DataType::Utf8, "VARCHAR"),
+ (&default_dialect, DataType::LargeUtf8, "TEXT"),
+ (&mysql_custom_dialect, DataType::Utf8, "CHAR"),
+ (&mysql_custom_dialect, DataType::LargeUtf8, "CHAR"),
+ ] {
+ let unparser = Unparser::new(dialect);
+
+ let expr = Expr::Cast(Cast {
+ expr: Box::new(col("a")),
+ data_type,
+ });
+ let ast = unparser.expr_to_sql(&expr)?;
+
+ let actual = format!("{}", ast);
+ let expected = format!(r#"CAST(a AS {identifier})"#);
+
+ assert_eq!(actual, expected);
+ }
+ Ok(())
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]