This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 4534a285c6 Support DictionaryString for Regex matching operators
(#12768)
4534a285c6 is described below
commit 4534a285c649be8fe7dde56b25078d77d82cf002
Author: Dmitrii Blaginin <[email protected]>
AuthorDate: Thu Oct 10 21:28:50 2024 +0100
Support DictionaryString for Regex matching operators (#12768)
* Do not optimise regex casts in `UnwrapCastInComparison`
* Mark regex operators are non-comparison ones
* Also remove is distinct from
* Rename method for `supports_propagation` for clarity
* Remove duplicated tests
* Fix doctest
* Revert `supports_propagation` and handle dicts in
`binary_string_array_flag_op_scalar`
* Update tabulation
* Minimise code inside `downcast_dictionary_array!` to fix wasm-pack build
* Switch to `as_any_dictionary`
* Use `take_iter`
* Revert `is_comparison_operator` and make it deprecated
---
datafusion/expr-common/src/interval_arithmetic.rs | 2 +-
datafusion/expr-common/src/operator.rs | 16 ++++--
.../optimizer/src/unwrap_cast_in_comparison.rs | 2 +-
datafusion/physical-expr/src/expressions/binary.rs | 59 +++++++++++++++++-----
.../test_files/string/large_string.slt | 13 -----
.../sqllogictest/test_files/string/string.slt | 13 -----
.../test_files/string/string_query.slt.part | 22 ++++----
.../sqllogictest/test_files/string/string_view.slt | 13 -----
8 files changed, 72 insertions(+), 68 deletions(-)
diff --git a/datafusion/expr-common/src/interval_arithmetic.rs
b/datafusion/expr-common/src/interval_arithmetic.rs
index 6424888c89..e76453d91a 100644
--- a/datafusion/expr-common/src/interval_arithmetic.rs
+++ b/datafusion/expr-common/src/interval_arithmetic.rs
@@ -1753,7 +1753,7 @@ impl NullableInterval {
}
_ => Ok(Self::MaybeNull { values }),
}
- } else if op.is_comparison_operator() {
+ } else if op.supports_propagation() {
Ok(Self::Null {
datatype: DataType::Boolean,
})
diff --git a/datafusion/expr-common/src/operator.rs
b/datafusion/expr-common/src/operator.rs
index e013b6fafa..6ca0f04897 100644
--- a/datafusion/expr-common/src/operator.rs
+++ b/datafusion/expr-common/src/operator.rs
@@ -142,10 +142,11 @@ impl Operator {
)
}
- /// Return true if the operator is a comparison operator.
+ /// Return true if the comparison operator can be used in interval
arithmetic and constraint
+ /// propagation
///
- /// For example, 'Binary(a, >, b)' would be a comparison expression.
- pub fn is_comparison_operator(&self) -> bool {
+ /// For example, 'Binary(a, >, b)' expression supports propagation.
+ pub fn supports_propagation(&self) -> bool {
matches!(
self,
Operator::Eq
@@ -163,6 +164,15 @@ impl Operator {
)
}
+ /// Return true if the comparison operator can be used in interval
arithmetic and constraint
+ /// propagation
+ ///
+ /// For example, 'Binary(a, >, b)' expression supports propagation.
+ #[deprecated(since = "43.0.0", note = "please use `supports_propagation`
instead")]
+ pub fn is_comparison_operator(&self) -> bool {
+ self.supports_propagation()
+ }
+
/// Return true if the operator is a logic operator.
///
/// For example, 'Binary(Binary(a, >, b), AND, Binary(a, <, b + 3))' would
diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
index 22e3c0ddd0..31e21d08b5 100644
--- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
+++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
@@ -146,7 +146,7 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter {
};
is_supported_type(&left_type)
&& is_supported_type(&right_type)
- && op.is_comparison_operator()
+ && op.supports_propagation()
} =>
{
match (left.as_mut(), right.as_mut()) {
diff --git a/datafusion/physical-expr/src/expressions/binary.rs
b/datafusion/physical-expr/src/expressions/binary.rs
index 3d9072c2e1..47b04a876b 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -186,7 +186,9 @@ macro_rules! compute_utf8_flag_op {
}
macro_rules! binary_string_array_flag_op_scalar {
- ($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
+ ($LEFT:ident, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
+ // This macro is slightly different from binary_string_array_flag_op
because, when comparing with a scalar value,
+ // the query can be optimized in such a way that operands will be
dicts, so we need to support it here
let result: Result<Arc<dyn Array>> = match $LEFT.data_type() {
DataType::Utf8View | DataType::Utf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray,
$NOT, $FLAG)
@@ -194,6 +196,27 @@ macro_rules! binary_string_array_flag_op_scalar {
DataType::LargeUtf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP,
LargeStringArray, $NOT, $FLAG)
},
+ DataType::Dictionary(_, _) => {
+ let values = $LEFT.as_any_dictionary().values();
+
+ match values.data_type() {
+ DataType::Utf8View | DataType::Utf8 =>
compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG),
+ DataType::LargeUtf8 =>
compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT,
$FLAG),
+ other => internal_err!(
+ "Data type {:?} not supported as a dictionary value
type for binary_string_array_flag_op_scalar operation '{}' on string array",
+ other, stringify!($OP)
+ ),
+ }.map(
+ // downcast_dictionary_array duplicates code per possible
key type, so we aim to do all prep work before
+ |evaluated_values| downcast_dictionary_array! {
+ $LEFT => {
+ let unpacked_dict =
evaluated_values.take_iter($LEFT.keys().iter().map(|opt| opt.map(|v| v as
_))).collect::<BooleanArray>();
+ Arc::new(unpacked_dict) as _
+ },
+ _ => unreachable!(),
+ }
+ )
+ },
other => internal_err!(
"Data type {:?} not supported for
binary_string_array_flag_op_scalar operation '{}' on string array",
other, stringify!($OP)
@@ -211,20 +234,32 @@ macro_rules! compute_utf8_flag_op_scalar {
.downcast_ref::<$ARRAYTYPE>()
.expect("compute_utf8_flag_op_scalar failed to downcast array");
- if let ScalarValue::Utf8(Some(string_value)) |
ScalarValue::LargeUtf8(Some(string_value)) = $RIGHT {
- let flag = $FLAG.then_some("i");
- let mut array =
- paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
- if $NOT {
- array = not(&array).unwrap();
- }
- Ok(Arc::new(array))
- } else {
- internal_err!(
+ let string_value = match $RIGHT {
+ ScalarValue::Utf8(Some(string_value)) |
ScalarValue::LargeUtf8(Some(string_value)) => string_value,
+ ScalarValue::Dictionary(_, value) => {
+ match *value {
+ ScalarValue::Utf8(Some(string_value)) |
ScalarValue::LargeUtf8(Some(string_value)) => string_value,
+ other => return internal_err!(
+ "compute_utf8_flag_op_scalar failed to cast
dictionary value {} for operation '{}'",
+ other, stringify!($OP)
+ )
+ }
+ },
+ _ => return internal_err!(
"compute_utf8_flag_op_scalar failed to cast literal value {}
for operation '{}'",
$RIGHT, stringify!($OP)
)
+
+ };
+
+ let flag = $FLAG.then_some("i");
+ let mut array =
+ paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
+ if $NOT {
+ array = not(&array).unwrap();
}
+
+ Ok(Arc::new(array))
}};
}
@@ -429,7 +464,7 @@ impl PhysicalExpr for BinaryExpr {
// end-points of its children.
Ok(Some(vec![]))
}
- } else if self.op.is_comparison_operator() {
+ } else if self.op.supports_propagation() {
Ok(
propagate_comparison(&self.op, interval, left_interval,
right_interval)?
.map(|(left, right)| vec![left, right]),
diff --git a/datafusion/sqllogictest/test_files/string/large_string.slt
b/datafusion/sqllogictest/test_files/string/large_string.slt
index af6d104e57..8d8a5711bd 100644
--- a/datafusion/sqllogictest/test_files/string/large_string.slt
+++ b/datafusion/sqllogictest/test_files/string/large_string.slt
@@ -59,19 +59,6 @@ Xiangpeng datafusion数据融合 false true false true
Raphael datafusionДатаФусион false false false false
NULL NULL NULL NULL NULL NULL
-# TODO: move it back to `string_query.slt.part` after fixing the issue
-# https://github.com/apache/datafusion/issues/12618
-query BB
-SELECT
- ascii_1 ~* '^a.{3}e',
- unicode_1 ~* '^d.*Фу'
-FROM test_basic_operator;
-----
-true false
-false false
-false true
-NULL NULL
-
#
# common test for string-like functions and operators
#
diff --git a/datafusion/sqllogictest/test_files/string/string.slt
b/datafusion/sqllogictest/test_files/string/string.slt
index f003e01ecd..e84342abd3 100644
--- a/datafusion/sqllogictest/test_files/string/string.slt
+++ b/datafusion/sqllogictest/test_files/string/string.slt
@@ -34,19 +34,6 @@ statement ok
create table test_substr as
select arrow_cast(col1, 'Utf8') as c1 from test_substr_base;
-# TODO: move it back to `string_query.slt.part` after fixing the issue
-# https://github.com/apache/datafusion/issues/12618
-query BB
-SELECT
- ascii_1 ~* '^a.{3}e',
- unicode_1 ~* '^d.*Фу'
-FROM test_basic_operator;
-----
-true false
-false false
-false true
-NULL NULL
-
# TODO: move it back to `string_query.slt.part` after fixing the issue
# see detail: https://github.com/apache/datafusion/issues/12637
# Test pattern with wildcard characters
diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part
b/datafusion/sqllogictest/test_files/string/string_query.slt.part
index 6a02296f5e..dc5626b7d5 100644
--- a/datafusion/sqllogictest/test_files/string/string_query.slt.part
+++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part
@@ -642,18 +642,16 @@ true false
false true
NULL NULL
-# TODO: DictionaryString does not support ~* operator. Enable this after
fixing the issue
-# see issue: https://github.com/apache/datafusion/issues/12618
-#query BB
-#SELECT
-# ascii_1 ~* '^a.{3}e',
-# unicode_1 ~* '^d.*Фу'
-#FROM test_basic_operator;
-#----
-#true false
-#false false
-#false true
-#NULL NULL
+query BB
+SELECT
+ ascii_1 ~* '^a.{3}e',
+ unicode_1 ~* '^d.*Фу'
+FROM test_basic_operator;
+----
+true false
+false false
+false true
+NULL NULL
query BB
SELECT
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt
b/datafusion/sqllogictest/test_files/string/string_view.slt
index e01a40586f..19bea3bf6b 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -37,19 +37,6 @@ select arrow_cast(col1, 'Utf8View') as c1 from
test_substr_base;
statement ok
drop table test_source
-# TODO: move it back to `string_query.slt.part` after fixing the issue
-# https://github.com/apache/datafusion/issues/12618
-query BB
-SELECT
- ascii_1 ~* '^a.{3}e',
- unicode_1 ~* '^d.*Фу'
-FROM test_basic_operator;
-----
-true false
-false false
-false true
-NULL NULL
-
#
# common test for string-like functions and operators
#
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]