This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 4534a285c6 Support DictionaryString for Regex matching operators 
(#12768)
4534a285c6 is described below

commit 4534a285c649be8fe7dde56b25078d77d82cf002
Author: Dmitrii Blaginin <[email protected]>
AuthorDate: Thu Oct 10 21:28:50 2024 +0100

    Support DictionaryString for Regex matching operators (#12768)
    
    * Do not optimise regex casts in `UnwrapCastInComparison`
    
    * Mark regex operators are non-comparison ones
    
    * Also remove is distinct from
    
    * Rename method for `supports_propagation` for clarity
    
    * Remove duplicated tests
    
    * Fix doctest
    
    * Revert `supports_propagation` and handle dicts in 
`binary_string_array_flag_op_scalar`
    
    * Update tabulation
    
    * Minimise code inside `downcast_dictionary_array!` to fix wasm-pack build
    
    * Switch to `as_any_dictionary`
    
    * Use `take_iter`
    
    * Revert `is_comparison_operator` and make it deprecated
---
 datafusion/expr-common/src/interval_arithmetic.rs  |  2 +-
 datafusion/expr-common/src/operator.rs             | 16 ++++--
 .../optimizer/src/unwrap_cast_in_comparison.rs     |  2 +-
 datafusion/physical-expr/src/expressions/binary.rs | 59 +++++++++++++++++-----
 .../test_files/string/large_string.slt             | 13 -----
 .../sqllogictest/test_files/string/string.slt      | 13 -----
 .../test_files/string/string_query.slt.part        | 22 ++++----
 .../sqllogictest/test_files/string/string_view.slt | 13 -----
 8 files changed, 72 insertions(+), 68 deletions(-)

diff --git a/datafusion/expr-common/src/interval_arithmetic.rs 
b/datafusion/expr-common/src/interval_arithmetic.rs
index 6424888c89..e76453d91a 100644
--- a/datafusion/expr-common/src/interval_arithmetic.rs
+++ b/datafusion/expr-common/src/interval_arithmetic.rs
@@ -1753,7 +1753,7 @@ impl NullableInterval {
                         }
                         _ => Ok(Self::MaybeNull { values }),
                     }
-                } else if op.is_comparison_operator() {
+                } else if op.supports_propagation() {
                     Ok(Self::Null {
                         datatype: DataType::Boolean,
                     })
diff --git a/datafusion/expr-common/src/operator.rs 
b/datafusion/expr-common/src/operator.rs
index e013b6fafa..6ca0f04897 100644
--- a/datafusion/expr-common/src/operator.rs
+++ b/datafusion/expr-common/src/operator.rs
@@ -142,10 +142,11 @@ impl Operator {
         )
     }
 
-    /// Return true if the operator is a comparison operator.
+    /// Return true if the comparison operator can be used in interval 
arithmetic and constraint
+    /// propagation
     ///
-    /// For example, 'Binary(a, >, b)' would be a comparison expression.
-    pub fn is_comparison_operator(&self) -> bool {
+    /// For example, 'Binary(a, >, b)' expression supports propagation.
+    pub fn supports_propagation(&self) -> bool {
         matches!(
             self,
             Operator::Eq
@@ -163,6 +164,15 @@ impl Operator {
         )
     }
 
+    /// Return true if the comparison operator can be used in interval 
arithmetic and constraint
+    /// propagation
+    ///
+    /// For example, 'Binary(a, >, b)' expression supports propagation.
+    #[deprecated(since = "43.0.0", note = "please use `supports_propagation` 
instead")]
+    pub fn is_comparison_operator(&self) -> bool {
+        self.supports_propagation()
+    }
+
     /// Return true if the operator is a logic operator.
     ///
     /// For example, 'Binary(Binary(a, >, b), AND, Binary(a, <, b + 3))' would
diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs 
b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
index 22e3c0ddd0..31e21d08b5 100644
--- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
+++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
@@ -146,7 +146,7 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter {
                     };
                     is_supported_type(&left_type)
                         && is_supported_type(&right_type)
-                        && op.is_comparison_operator()
+                        && op.supports_propagation()
                 } =>
             {
                 match (left.as_mut(), right.as_mut()) {
diff --git a/datafusion/physical-expr/src/expressions/binary.rs 
b/datafusion/physical-expr/src/expressions/binary.rs
index 3d9072c2e1..47b04a876b 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -186,7 +186,9 @@ macro_rules! compute_utf8_flag_op {
 }
 
 macro_rules! binary_string_array_flag_op_scalar {
-    ($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
+    ($LEFT:ident, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
+        // This macro is slightly different from binary_string_array_flag_op 
because, when comparing with a scalar value,
+        // the query can be optimized in such a way that operands will be 
dicts, so we need to support it here
         let result: Result<Arc<dyn Array>> = match $LEFT.data_type() {
             DataType::Utf8View | DataType::Utf8 => {
                 compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, 
$NOT, $FLAG)
@@ -194,6 +196,27 @@ macro_rules! binary_string_array_flag_op_scalar {
             DataType::LargeUtf8 => {
                 compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, 
LargeStringArray, $NOT, $FLAG)
             },
+            DataType::Dictionary(_, _) => {
+                let values = $LEFT.as_any_dictionary().values();
+
+                match values.data_type() {
+                    DataType::Utf8View | DataType::Utf8 => 
compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG),
+                    DataType::LargeUtf8 => 
compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT, 
$FLAG),
+                    other => internal_err!(
+                        "Data type {:?} not supported as a dictionary value 
type for binary_string_array_flag_op_scalar operation '{}' on string array",
+                        other, stringify!($OP)
+                    ),
+                }.map(
+                    // downcast_dictionary_array duplicates code per possible 
key type, so we aim to do all prep work before
+                    |evaluated_values| downcast_dictionary_array! {
+                        $LEFT => {
+                            let unpacked_dict = 
evaluated_values.take_iter($LEFT.keys().iter().map(|opt| opt.map(|v| v as 
_))).collect::<BooleanArray>();
+                            Arc::new(unpacked_dict) as _
+                        },
+                        _ => unreachable!(),
+                    }
+                )
+            },
             other => internal_err!(
                 "Data type {:?} not supported for 
binary_string_array_flag_op_scalar operation '{}' on string array",
                 other, stringify!($OP)
@@ -211,20 +234,32 @@ macro_rules! compute_utf8_flag_op_scalar {
             .downcast_ref::<$ARRAYTYPE>()
             .expect("compute_utf8_flag_op_scalar failed to downcast array");
 
-        if let ScalarValue::Utf8(Some(string_value)) | 
ScalarValue::LargeUtf8(Some(string_value)) = $RIGHT {
-            let flag = $FLAG.then_some("i");
-            let mut array =
-                paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
-            if $NOT {
-                array = not(&array).unwrap();
-            }
-            Ok(Arc::new(array))
-        } else {
-            internal_err!(
+        let string_value = match $RIGHT {
+            ScalarValue::Utf8(Some(string_value)) | 
ScalarValue::LargeUtf8(Some(string_value)) => string_value,
+            ScalarValue::Dictionary(_, value) => {
+                match *value {
+                    ScalarValue::Utf8(Some(string_value)) | 
ScalarValue::LargeUtf8(Some(string_value)) => string_value,
+                    other => return internal_err!(
+                            "compute_utf8_flag_op_scalar failed to cast 
dictionary value {} for operation '{}'",
+                            other, stringify!($OP)
+                        )
+                }
+            },
+            _ => return internal_err!(
                 "compute_utf8_flag_op_scalar failed to cast literal value {} 
for operation '{}'",
                 $RIGHT, stringify!($OP)
             )
+
+        };
+
+        let flag = $FLAG.then_some("i");
+        let mut array =
+            paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
+        if $NOT {
+            array = not(&array).unwrap();
         }
+
+        Ok(Arc::new(array))
     }};
 }
 
@@ -429,7 +464,7 @@ impl PhysicalExpr for BinaryExpr {
                 // end-points of its children.
                 Ok(Some(vec![]))
             }
-        } else if self.op.is_comparison_operator() {
+        } else if self.op.supports_propagation() {
             Ok(
                 propagate_comparison(&self.op, interval, left_interval, 
right_interval)?
                     .map(|(left, right)| vec![left, right]),
diff --git a/datafusion/sqllogictest/test_files/string/large_string.slt 
b/datafusion/sqllogictest/test_files/string/large_string.slt
index af6d104e57..8d8a5711bd 100644
--- a/datafusion/sqllogictest/test_files/string/large_string.slt
+++ b/datafusion/sqllogictest/test_files/string/large_string.slt
@@ -59,19 +59,6 @@ Xiangpeng datafusion数据融合 false true false true
 Raphael datafusionДатаФусион false false false false
 NULL NULL NULL NULL NULL NULL
 
-# TODO: move it back to `string_query.slt.part` after fixing the issue
-# https://github.com/apache/datafusion/issues/12618
-query BB
-SELECT
-  ascii_1 ~* '^a.{3}e',
-  unicode_1 ~* '^d.*Фу'
-FROM test_basic_operator;
-----
-true false
-false false
-false true
-NULL NULL
-
 #
 # common test for string-like functions and operators
 #
diff --git a/datafusion/sqllogictest/test_files/string/string.slt 
b/datafusion/sqllogictest/test_files/string/string.slt
index f003e01ecd..e84342abd3 100644
--- a/datafusion/sqllogictest/test_files/string/string.slt
+++ b/datafusion/sqllogictest/test_files/string/string.slt
@@ -34,19 +34,6 @@ statement ok
 create table test_substr as
 select arrow_cast(col1, 'Utf8') as c1 from test_substr_base;
 
-# TODO: move it back to `string_query.slt.part` after fixing the issue
-# https://github.com/apache/datafusion/issues/12618
-query BB
-SELECT
-  ascii_1 ~* '^a.{3}e',
-  unicode_1 ~* '^d.*Фу'
-FROM test_basic_operator;
-----
-true false
-false false
-false true
-NULL NULL
-
 # TODO: move it back to `string_query.slt.part` after fixing the issue
 # see detail: https://github.com/apache/datafusion/issues/12637
 # Test pattern with wildcard characters
diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part 
b/datafusion/sqllogictest/test_files/string/string_query.slt.part
index 6a02296f5e..dc5626b7d5 100644
--- a/datafusion/sqllogictest/test_files/string/string_query.slt.part
+++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part
@@ -642,18 +642,16 @@ true false
 false true
 NULL NULL
 
-# TODO: DictionaryString does not support ~* operator. Enable this after 
fixing the issue
-# see issue: https://github.com/apache/datafusion/issues/12618
-#query BB
-#SELECT
-#  ascii_1 ~* '^a.{3}e',
-#  unicode_1 ~* '^d.*Фу'
-#FROM test_basic_operator;
-#----
-#true false
-#false false
-#false true
-#NULL NULL
+query BB
+SELECT
+  ascii_1 ~* '^a.{3}e',
+  unicode_1 ~* '^d.*Фу'
+FROM test_basic_operator;
+----
+true false
+false false
+false true
+NULL NULL
 
 query BB
 SELECT
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt 
b/datafusion/sqllogictest/test_files/string/string_view.slt
index e01a40586f..19bea3bf6b 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -37,19 +37,6 @@ select arrow_cast(col1, 'Utf8View') as c1 from 
test_substr_base;
 statement ok
 drop table test_source
 
-# TODO: move it back to `string_query.slt.part` after fixing the issue
-# https://github.com/apache/datafusion/issues/12618
-query BB
-SELECT
-  ascii_1 ~* '^a.{3}e',
-  unicode_1 ~* '^d.*Фу'
-FROM test_basic_operator;
-----
-true false
-false false
-false true
-NULL NULL
-
 #
 # common test for string-like functions and operators
 #


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to