This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 29874ba54f fix (#15275)
29874ba54f is described below
commit 29874ba54fff867f9d533875c1ecb785a4c28956
Author: Qi Zhu <[email protected]>
AuthorDate: Wed Mar 19 01:09:07 2025 +0800
fix (#15275)
---
datafusion/expr-common/src/type_coercion/binary.rs | 148 ++++++++++++++++++---
datafusion/physical-expr/src/expressions/binary.rs | 66 ++++++++-
.../sqllogictest/test_files/string/string_view.slt | 8 +-
3 files changed, 194 insertions(+), 28 deletions(-)
diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs
index 682cc885cd..fb559e163b 100644
--- a/datafusion/expr-common/src/type_coercion/binary.rs
+++ b/datafusion/expr-common/src/type_coercion/binary.rs
@@ -1177,26 +1177,6 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
}
}
-/// This will be deprecated when binary operators native support
-/// for Utf8View (use `string_coercion` instead).
-fn regex_comparison_string_coercion(
- lhs_type: &DataType,
- rhs_type: &DataType,
-) -> Option<DataType> {
- use arrow::datatypes::DataType::*;
- match (lhs_type, rhs_type) {
- // If Utf8View is in any side, we coerce to Utf8.
- (Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View)
=> {
- Some(Utf8)
- }
- // Then, if LargeUtf8 is in any side, we coerce to LargeUtf8.
- (LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8),
- // Utf8 coerces to Utf8
- (Utf8, Utf8) => Some(Utf8),
- _ => None,
- }
-}
-
fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
@@ -1327,7 +1307,7 @@ fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
/// Coercion rules for regular expression comparison operations.
/// This is a union of string coercion rules and dictionary coercion rules
pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
- regex_comparison_string_coercion(lhs_type, rhs_type)
+ string_coercion(lhs_type, rhs_type)
.or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false))
.or_else(|| regex_null_coercion(lhs_type, rhs_type))
}
@@ -1802,42 +1782,168 @@ mod tests {
Operator::RegexMatch,
DataType::Utf8
);
+ test_coercion_binary_rule!(
+ DataType::Utf8,
+ DataType::Utf8View,
+ Operator::RegexMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Utf8View,
+ DataType::Utf8,
+ Operator::RegexMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Utf8View,
+ DataType::Utf8View,
+ Operator::RegexMatch,
+ DataType::Utf8View
+ );
test_coercion_binary_rule!(
DataType::Utf8,
DataType::Utf8,
Operator::RegexNotMatch,
DataType::Utf8
);
+ test_coercion_binary_rule!(
+ DataType::Utf8View,
+ DataType::Utf8,
+ Operator::RegexNotMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Utf8,
+ DataType::Utf8View,
+ Operator::RegexNotMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Utf8View,
+ DataType::Utf8View,
+ Operator::RegexNotMatch,
+ DataType::Utf8View
+ );
test_coercion_binary_rule!(
DataType::Utf8,
DataType::Utf8,
Operator::RegexNotIMatch,
DataType::Utf8
);
+ test_coercion_binary_rule!(
+ DataType::Utf8View,
+ DataType::Utf8,
+ Operator::RegexNotIMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Utf8,
+ DataType::Utf8View,
+ Operator::RegexNotIMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Utf8View,
+ DataType::Utf8View,
+ Operator::RegexNotIMatch,
+ DataType::Utf8View
+ );
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8.into()),
DataType::Utf8,
Operator::RegexMatch,
DataType::Utf8
);
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8.into()),
+ DataType::Utf8View,
+ Operator::RegexMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8View.into()),
+ DataType::Utf8,
+ Operator::RegexMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8View.into()),
+ DataType::Utf8View,
+ Operator::RegexMatch,
+ DataType::Utf8View
+ );
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8.into()),
DataType::Utf8,
Operator::RegexIMatch,
DataType::Utf8
);
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8View.into()),
+ DataType::Utf8,
+ Operator::RegexIMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8.into()),
+ DataType::Utf8View,
+ Operator::RegexIMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8View.into()),
+ DataType::Utf8View,
+ Operator::RegexIMatch,
+ DataType::Utf8View
+ );
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8.into()),
DataType::Utf8,
Operator::RegexNotMatch,
DataType::Utf8
);
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8.into()),
+ DataType::Utf8View,
+ Operator::RegexNotMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8View.into()),
+ DataType::Utf8,
+ Operator::RegexNotMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8.into()),
+ DataType::Utf8View,
+ Operator::RegexNotMatch,
+ DataType::Utf8View
+ );
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8.into()),
DataType::Utf8,
Operator::RegexNotIMatch,
DataType::Utf8
);
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8View.into()),
+ DataType::Utf8,
+ Operator::RegexNotIMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8.into()),
+ DataType::Utf8View,
+ Operator::RegexNotIMatch,
+ DataType::Utf8View
+ );
+ test_coercion_binary_rule!(
+ DataType::Dictionary(DataType::Int32.into(),
DataType::Utf8View.into()),
+ DataType::Utf8View,
+ Operator::RegexNotIMatch,
+ DataType::Utf8View
+ );
test_coercion_binary_rule!(
DataType::Int16,
DataType::Int64,
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index 872773b06f..a00d135ef3 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -168,9 +168,12 @@ fn boolean_op(
macro_rules! binary_string_array_flag_op {
($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
match $LEFT.data_type() {
- DataType::Utf8View | DataType::Utf8 => {
+ DataType::Utf8 => {
compute_utf8_flag_op!($LEFT, $RIGHT, $OP, StringArray, $NOT,
$FLAG)
},
+ DataType::Utf8View => {
+ compute_utf8view_flag_op!($LEFT, $RIGHT, $OP, StringViewArray,
$NOT, $FLAG)
+ }
DataType::LargeUtf8 => {
compute_utf8_flag_op!($LEFT, $RIGHT, $OP, LargeStringArray,
$NOT, $FLAG)
},
@@ -207,14 +210,42 @@ macro_rules! compute_utf8_flag_op {
}};
}
+/// Invoke a compute kernel on a pair of binary data arrays with flags
+macro_rules! compute_utf8view_flag_op {
+ ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr,
$FLAG:expr) => {{
+ let ll = $LEFT
+ .as_any()
+ .downcast_ref::<$ARRAYTYPE>()
+ .expect("compute_utf8view_flag_op failed to downcast array");
+ let rr = $RIGHT
+ .as_any()
+ .downcast_ref::<$ARRAYTYPE>()
+ .expect("compute_utf8view_flag_op failed to downcast array");
+
+ let flag = if $FLAG {
+ Some($ARRAYTYPE::from(vec!["i"; ll.len()]))
+ } else {
+ None
+ };
+ let mut array = $OP(ll, rr, flag.as_ref())?;
+ if $NOT {
+ array = not(&array).unwrap();
+ }
+ Ok(Arc::new(array))
+ }};
+}
+
macro_rules! binary_string_array_flag_op_scalar {
($LEFT:ident, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
// This macro is slightly different from binary_string_array_flag_op
because, when comparing with a scalar value,
// the query can be optimized in such a way that operands will be
dicts, so we need to support it here
let result: Result<Arc<dyn Array>> = match $LEFT.data_type() {
- DataType::Utf8View | DataType::Utf8 => {
+ DataType::Utf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray,
$NOT, $FLAG)
},
+ DataType::Utf8View => {
+ compute_utf8view_flag_op_scalar!($LEFT, $RIGHT, $OP,
StringViewArray, $NOT, $FLAG)
+ }
DataType::LargeUtf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP,
LargeStringArray, $NOT, $FLAG)
},
@@ -222,7 +253,8 @@ macro_rules! binary_string_array_flag_op_scalar {
let values = $LEFT.as_any_dictionary().values();
match values.data_type() {
- DataType::Utf8View | DataType::Utf8 =>
compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG),
+ DataType::Utf8 => compute_utf8_flag_op_scalar!(values,
$RIGHT, $OP, StringArray, $NOT, $FLAG),
+ DataType::Utf8View =>
compute_utf8view_flag_op_scalar!(values, $RIGHT, $OP, StringViewArray, $NOT,
$FLAG),
DataType::LargeUtf8 =>
compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT,
$FLAG),
other => internal_err!(
"Data type {:?} not supported as a dictionary value type for binary_string_array_flag_op_scalar operation '{}' on string array",
@@ -276,6 +308,34 @@ macro_rules! compute_utf8_flag_op_scalar {
}};
}
+/// Invoke a compute kernel on a data array and a scalar value with flag
+macro_rules! compute_utf8view_flag_op_scalar {
+ ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr,
$FLAG:expr) => {{
+ let ll = $LEFT
+ .as_any()
+ .downcast_ref::<$ARRAYTYPE>()
+ .expect("compute_utf8view_flag_op_scalar failed to downcast array");
+
+ let string_value = match $RIGHT.try_as_str() {
+ Some(Some(string_value)) => string_value,
+ // null literal or non string
+ _ => return internal_err!(
+ "compute_utf8view_flag_op_scalar failed to cast literal value {} for operation '{}'",
+ $RIGHT, stringify!($OP)
+ )
+ };
+
+ let flag = $FLAG.then_some("i");
+ let mut array =
+ paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
+ if $NOT {
+ array = not(&array).unwrap();
+ }
+
+ Ok(Arc::new(array))
+ }};
+}
+
impl PhysicalExpr for BinaryExpr {
/// Return a reference to Any that can be used for downcasting
fn as_any(&self) -> &dyn Any {
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt
index 69c4b9bfcb..96fb247759 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -1100,7 +1100,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
-01)Projection: CAST(test.column1_utf8view AS Utf8) LIKE Utf8("%an%") AS c1
+01)Projection: test.column1_utf8view ~ Utf8View("an") AS c1
02)--TableScan: test projection=[column1_utf8view]
# `~*` operator (regex match case-insensitive)
@@ -1110,7 +1110,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
-01)Projection: CAST(test.column1_utf8view AS Utf8) ~* Utf8("^a.{3}e") AS c1
+01)Projection: test.column1_utf8view ~* Utf8View("^a.{3}e") AS c1
02)--TableScan: test projection=[column1_utf8view]
# `!~~` operator (not like match)
@@ -1120,7 +1120,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
-01)Projection: CAST(test.column1_utf8view AS Utf8) !~~ Utf8("xia_g%g") AS c1
+01)Projection: test.column1_utf8view !~~ Utf8View("xia_g%g") AS c1
02)--TableScan: test projection=[column1_utf8view]
# `!~~*` operator (not like match case-insensitive)
@@ -1130,7 +1130,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
-01)Projection: CAST(test.column1_utf8view AS Utf8) !~~* Utf8("xia_g%g") AS c1
+01)Projection: test.column1_utf8view !~~* Utf8View("xia_g%g") AS c1
02)--TableScan: test projection=[column1_utf8view]
# coercions between stringview and date types
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]