This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 0e2023d044 fix: using simple string match replace regex match for
contains udf (#12931)
0e2023d044 is described below
commit 0e2023d044eef862dec210a30051fd8dd9430f00
Author: zhuliquan <[email protected]>
AuthorDate: Thu Oct 17 22:09:54 2024 +0800
fix: using simple string match replace regex match for contains udf (#12931)
* fix: using simple string match replace regex match
* doc: update doc of contains
* test: add case for contains udf
---------
Co-authored-by: zhuliquan <[email protected]>
---
datafusion/functions/src/string/contains.rs | 55 ++++++++++++++++++-----------
1 file changed, 34 insertions(+), 21 deletions(-)
diff --git a/datafusion/functions/src/string/contains.rs
b/datafusion/functions/src/string/contains.rs
index 0f75731aa1..86f1eda033 100644
--- a/datafusion/functions/src/string/contains.rs
+++ b/datafusion/functions/src/string/contains.rs
@@ -16,8 +16,8 @@
// under the License.
use crate::utils::make_scalar_function;
-use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray,
StringViewArray};
-use arrow::compute::regexp_is_match;
+use arrow::array::{Array, ArrayRef, AsArray};
+use arrow::compute::contains as arrow_contains;
use arrow::datatypes::DataType;
use arrow::datatypes::DataType::{Boolean, LargeUtf8, Utf8, Utf8View};
use datafusion_common::exec_err;
@@ -102,40 +102,25 @@ fn get_contains_doc() -> &'static Documentation {
})
}
-/// use regexp_is_match_utf8_scalar to do the calculation for contains
+/// use `arrow::compute::contains` to do the calculation for contains
pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
match (args[0].data_type(), args[1].data_type()) {
(Utf8View, Utf8View) => {
let mod_str = args[0].as_string_view();
let match_str = args[1].as_string_view();
- let res = regexp_is_match::<
- StringViewArray,
- StringViewArray,
- GenericStringArray<i32>,
- >(mod_str, match_str, None)?;
-
+ let res = arrow_contains(mod_str, match_str)?;
Ok(Arc::new(res) as ArrayRef)
}
(Utf8, Utf8) => {
let mod_str = args[0].as_string::<i32>();
let match_str = args[1].as_string::<i32>();
- let res = regexp_is_match::<
- GenericStringArray<i32>,
- GenericStringArray<i32>,
- GenericStringArray<i32>,
- >(mod_str, match_str, None)?;
-
+ let res = arrow_contains(mod_str, match_str)?;
Ok(Arc::new(res) as ArrayRef)
}
(LargeUtf8, LargeUtf8) => {
let mod_str = args[0].as_string::<i64>();
let match_str = args[1].as_string::<i64>();
- let res = regexp_is_match::<
- GenericStringArray<i64>,
- GenericStringArray<i64>,
- GenericStringArray<i32>,
- >(mod_str, match_str, None)?;
-
+ let res = arrow_contains(mod_str, match_str)?;
Ok(Arc::new(res) as ArrayRef)
}
other => {
@@ -143,3 +128,31 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef,
DataFusionError> {
}
}
}
+
+#[cfg(test)]
+mod test {
+ use super::ContainsFunc;
+ use arrow::array::{BooleanArray, StringArray};
+ use datafusion_common::ScalarValue;
+ use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+ use std::sync::Arc;
+
+ #[test]
+ fn test_contains_udf() {
+ let udf = ContainsFunc::new();
+ let array = ColumnarValue::Array(Arc::new(StringArray::from(vec![
+ Some("xxx?()"),
+ Some("yyy?()"),
+ ])));
+ let scalar =
ColumnarValue::Scalar(ScalarValue::Utf8(Some("x?(".to_string())));
+ let actual = udf.invoke(&[array, scalar]).unwrap();
+ let expect = ColumnarValue::Array(Arc::new(BooleanArray::from(vec![
+ Some(true),
+ Some(false),
+ ])));
+ assert_eq!(
+ *actual.into_array(2).unwrap(),
+ *expect.into_array(2).unwrap()
+ );
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]