This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 5188a5d4f3 port regexp_like function and port related tests (#9397)
5188a5d4f3 is described below
commit 5188a5d4f3ce78b891afaf87426b36c6c7ee38d7
Author: Lordworms <[email protected]>
AuthorDate: Mon Mar 4 05:06:35 2024 -0600
port regexp_like function and port related tests (#9397)
* port regexp_like function and port related tests
delete useless test
fix chores
change regexp_like
change lock
change format
fix typo
change prost
remove unused
adding usage
adding dependency
adding dependency
dep
* adding tests
* remove useless
* remove unused
* remove unused
* change dependency structure
* format toml
---
.../core/tests/dataframe/dataframe_functions.rs | 2 +-
datafusion/expr/src/built_in_function.rs | 22 --
datafusion/expr/src/expr_fn.rs | 7 -
datafusion/functions/Cargo.toml | 4 +
.../regexp.rs => functions/benches/regx.rs} | 30 +--
datafusion/functions/src/regex/mod.rs | 13 +-
datafusion/functions/src/regex/regexplike.rs | 252 +++++++++++++++++++++
datafusion/functions/src/regex/regexpmatch.rs | 71 +++++-
datafusion/physical-expr/benches/regexp.rs | 20 +-
datafusion/physical-expr/src/functions.rs | 101 +--------
datafusion/physical-expr/src/regex_expressions.rs | 130 -----------
datafusion/proto/proto/datafusion.proto | 2 +-
datafusion/proto/src/generated/pbjson.rs | 3 -
datafusion/proto/src/generated/prost.rs | 5 +-
datafusion/proto/src/logical_plan/from_proto.rs | 19 +-
datafusion/proto/src/logical_plan/to_proto.rs | 1 -
datafusion/sqllogictest/test_files/regexp.slt | 4 +
17 files changed, 354 insertions(+), 332 deletions(-)
diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs
index 8bb23e96e0..c857202c23 100644
--- a/datafusion/core/tests/dataframe/dataframe_functions.rs
+++ b/datafusion/core/tests/dataframe/dataframe_functions.rs
@@ -426,7 +426,7 @@ async fn test_fn_md5() -> Result<()> {
#[tokio::test]
#[cfg(feature = "unicode_expressions")]
async fn test_fn_regexp_like() -> Result<()> {
- let expr = regexp_like(vec![col("a"), lit("[a-z]")]);
+ let expr = regexp_like(col("a"), lit("[a-z]"));
let expected = [
"+-----------------------------------+",
diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs
index c04d867156..6b3f2b956d 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -218,8 +218,6 @@ pub enum BuiltinScalarFunction {
OctetLength,
/// random
Random,
- /// regexp_like
- RegexpLike,
/// regexp_match
/// regexp_replace
RegexpReplace,
@@ -419,7 +417,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::MD5 => Volatility::Immutable,
BuiltinScalarFunction::OctetLength => Volatility::Immutable,
BuiltinScalarFunction::Radians => Volatility::Immutable,
- BuiltinScalarFunction::RegexpLike => Volatility::Immutable,
BuiltinScalarFunction::RegexpReplace => Volatility::Immutable,
BuiltinScalarFunction::Repeat => Volatility::Immutable,
BuiltinScalarFunction::Replace => Volatility::Immutable,
@@ -754,15 +751,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Upper => {
utf8_to_str_type(&input_expr_types[0], "upper")
}
- BuiltinScalarFunction::RegexpLike => Ok(match &input_expr_types[0]
{
- LargeUtf8 | Utf8 => Boolean,
- Null => Null,
- other => {
- return plan_err!(
- "The regexp_like function can only accept strings. Got
{other}"
- );
- }
- }),
BuiltinScalarFunction::Factorial
| BuiltinScalarFunction::Gcd
@@ -1173,15 +1161,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Replace | BuiltinScalarFunction::Translate
=> {
Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])],
self.volatility())
}
- BuiltinScalarFunction::RegexpLike => Signature::one_of(
- vec![
- Exact(vec![Utf8, Utf8]),
- Exact(vec![LargeUtf8, Utf8]),
- Exact(vec![Utf8, Utf8, Utf8]),
- Exact(vec![LargeUtf8, Utf8, Utf8]),
- ],
- self.volatility(),
- ),
BuiltinScalarFunction::RegexpReplace => Signature::one_of(
vec![
Exact(vec![Utf8, Utf8, Utf8]),
@@ -1420,7 +1399,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::FindInSet => &["find_in_set"],
// regex functions
- BuiltinScalarFunction::RegexpLike => &["regexp_like"],
BuiltinScalarFunction::RegexpReplace => &["regexp_replace"],
// time/date functions
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index ba8b76ac6f..ec53fd4ef1 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -822,11 +822,6 @@ nary_scalar_expr!(
rpad,
"fill up a string to the length by appending the characters"
);
-nary_scalar_expr!(
- RegexpLike,
- regexp_like,
- "matches a regular expression against a string and returns true or false
if there was at least one match or not"
-);
nary_scalar_expr!(
RegexpReplace,
regexp_replace,
@@ -1319,8 +1314,6 @@ mod test {
test_scalar_expr!(Ltrim, ltrim, string);
test_scalar_expr!(MD5, md5, string);
test_scalar_expr!(OctetLength, octet_length, string);
- test_nary_scalar_expr!(RegexpLike, regexp_like, string, pattern);
- test_nary_scalar_expr!(RegexpLike, regexp_like, string, pattern,
flags);
test_nary_scalar_expr!(
RegexpReplace,
regexp_replace,
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index e890c9623c..502c692301 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -68,3 +68,7 @@ tokio = { workspace = true, features = ["macros", "rt", "sync"] }
[[bench]]
harness = false
name = "to_timestamp"
+
+[[bench]]
+harness = false
+name = "regx"
diff --git a/datafusion/physical-expr/benches/regexp.rs b/datafusion/functions/benches/regx.rs
similarity index 79%
copy from datafusion/physical-expr/benches/regexp.rs
copy to datafusion/functions/benches/regx.rs
index 0371b6bf28..390676f8f2 100644
--- a/datafusion/physical-expr/benches/regexp.rs
+++ b/datafusion/functions/benches/regx.rs
@@ -17,21 +17,17 @@
extern crate criterion;
-use std::iter;
use std::sync::Arc;
use arrow_array::builder::StringBuilder;
use arrow_array::{ArrayRef, StringArray};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_functions::regex::regexplike::regexp_like;
+use datafusion_functions::regex::regexpmatch::regexp_match;
use rand::distributions::Alphanumeric;
use rand::rngs::ThreadRng;
use rand::seq::SliceRandom;
use rand::Rng;
-
-use datafusion_physical_expr::regex_expressions::{
- regexp_like, regexp_match, regexp_replace,
-};
-
fn data(rng: &mut ThreadRng) -> StringArray {
let mut data: Vec<String> = vec![];
for _ in 0..1000 {
@@ -105,28 +101,6 @@ fn criterion_benchmark(c: &mut Criterion) {
)
})
});
-
- c.bench_function("regexp_replace_1000", |b| {
- let mut rng = rand::thread_rng();
- let data = Arc::new(data(&mut rng)) as ArrayRef;
- let regex = Arc::new(regex(&mut rng)) as ArrayRef;
- let flags = Arc::new(flags(&mut rng)) as ArrayRef;
- let replacement =
-
Arc::new(StringArray::from_iter_values(iter::repeat("XX").take(1000)))
- as ArrayRef;
-
- b.iter(|| {
- black_box(
- regexp_replace::<i32>(&[
- data.clone(),
- regex.clone(),
- replacement.clone(),
- flags.clone(),
- ])
- .expect("regexp_replace should work on valid values"),
- )
- })
- });
}
criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/src/regex/mod.rs b/datafusion/functions/src/regex/mod.rs
index 862e8b77a2..1e0c7799c6 100644
--- a/datafusion/functions/src/regex/mod.rs
+++ b/datafusion/functions/src/regex/mod.rs
@@ -17,13 +17,18 @@
//! "regx" DataFusion functions
-mod regexpmatch;
+pub mod regexplike;
+pub mod regexpmatch;
+
// create UDFs
make_udf_function!(regexpmatch::RegexpMatchFunc, REGEXP_MATCH, regexp_match);
-
+make_udf_function!(regexplike::RegexpLikeFunc, REGEXP_LIKE, regexp_like);
export_functions!((
regexp_match,
- input_arg1
- input_arg2,
+ input_arg1 input_arg2,
"returns a list of regular expression matches in a string. "
+),(
+ regexp_like,
+ input_arg1 input_arg2,
+ "Returns true if a has at least one match in a string,false otherwise."
));
diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs
new file mode 100644
index 0000000000..b0abad3180
--- /dev/null
+++ b/datafusion/functions/src/regex/regexplike.rs
@@ -0,0 +1,252 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Regx expressions
+use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
+use arrow::compute::kernels::regexp;
+use arrow::datatypes::DataType;
+use datafusion_common::exec_err;
+use datafusion_common::ScalarValue;
+use datafusion_common::{arrow_datafusion_err, plan_err};
+use datafusion_common::{
+ cast::as_generic_string_array, internal_err, DataFusionError, Result,
+};
+use datafusion_expr::ColumnarValue;
+use datafusion_expr::TypeSignature::*;
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use std::any::Any;
+use std::sync::Arc;
+
+#[derive(Debug)]
+pub(super) struct RegexpLikeFunc {
+ signature: Signature,
+}
+impl RegexpLikeFunc {
+ pub fn new() -> Self {
+ use DataType::*;
+ Self {
+ signature: Signature::one_of(
+ vec![
+ Exact(vec![Utf8, Utf8]),
+ Exact(vec![LargeUtf8, Utf8]),
+ Exact(vec![Utf8, Utf8, Utf8]),
+ Exact(vec![LargeUtf8, Utf8, Utf8]),
+ ],
+ Volatility::Immutable,
+ ),
+ }
+ }
+}
+
+impl ScalarUDFImpl for RegexpLikeFunc {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "regexp_like"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ use DataType::*;
+
+ Ok(match &arg_types[0] {
+ LargeUtf8 | Utf8 => Boolean,
+ Null => Null,
+ other => {
+ return plan_err!(
+ "The regexp_like function can only accept strings. Got {other}"
+ );
+ }
+ })
+ }
+ fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ let len = args
+ .iter()
+ .fold(Option::<usize>::None, |acc, arg| match arg {
+ ColumnarValue::Scalar(_) => acc,
+ ColumnarValue::Array(a) => Some(a.len()),
+ });
+
+ let is_scalar = len.is_none();
+ let inferred_length = len.unwrap_or(1);
+ let args = args
+ .iter()
+ .map(|arg| arg.clone().into_array(inferred_length))
+ .collect::<Result<Vec<_>>>()?;
+
+ let result = regexp_like_func(&args);
+ if is_scalar {
+ // If all inputs are scalar, keeps output as scalar
+ let result = result.and_then(|arr|
ScalarValue::try_from_array(&arr, 0));
+ result.map(ColumnarValue::Scalar)
+ } else {
+ result.map(ColumnarValue::Array)
+ }
+ }
+}
+fn regexp_like_func(args: &[ArrayRef]) -> Result<ArrayRef> {
+ match args[0].data_type() {
+ DataType::Utf8 => regexp_like::<i32>(args),
+ DataType::LargeUtf8 => regexp_like::<i64>(args),
+ other => {
+ internal_err!("Unsupported data type {other:?} for function regexp_like")
+ }
+ }
+}
+/// Tests a string using a regular expression returning true if at
+/// least one match, false otherwise.
+///
+/// The full list of supported features and syntax can be found at
+/// <https://docs.rs/regex/latest/regex/#syntax>
+///
+/// Supported flags can be found at
+/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
+///
+/// # Examples
+///
+/// ```ignore
+/// # use datafusion::prelude::*;
+/// # use datafusion::error::Result;
+/// # #[tokio::main]
+/// # async fn main() -> Result<()> {
+/// let ctx = SessionContext::new();
+/// let df = ctx.read_csv("tests/data/regex.csv",
CsvReadOptions::new()).await?;
+///
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' without flags
+/// let df = df.with_column(
+/// "a",
+/// regexp_like(vec![col("values"), col("patterns")])
+/// )?;
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' with flags
+/// let df = df.with_column(
+/// "b",
+/// regexp_like(vec![col("values"), col("patterns"), col("flags")])
+/// )?;
+/// // literals can be used as well with dataframe calls
+/// let df = df.with_column(
+/// "c",
+/// regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
+/// )?;
+///
+/// df.show().await?;
+///
+/// # Ok(())
+/// # }
+/// ```
+pub fn regexp_like<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+ match args.len() {
+ 2 => {
+ let values = as_generic_string_array::<T>(&args[0])?;
+ let regex = as_generic_string_array::<T>(&args[1])?;
+ let array = regexp::regexp_is_match_utf8(values, regex, None)
+ .map_err(|e| arrow_datafusion_err!(e))?;
+
+ Ok(Arc::new(array) as ArrayRef)
+ }
+ 3 => {
+ let values = as_generic_string_array::<T>(&args[0])?;
+ let regex = as_generic_string_array::<T>(&args[1])?;
+ let flags = as_generic_string_array::<T>(&args[2])?;
+
+ if flags.iter().any(|s| s == Some("g")) {
+ return plan_err!("regexp_like() does not support the \"global\" option");
+ }
+
+ let array = regexp::regexp_is_match_utf8(values, regex,
Some(flags))
+ .map_err(|e| arrow_datafusion_err!(e))?;
+
+ Ok(Arc::new(array) as ArrayRef)
+ }
+ other => exec_err!(
+ "regexp_like was called with {other} arguments. It requires at least 2 and at most 3."
+ ),
+ }
+}
+#[cfg(test)]
+mod tests {
+ use std::sync::Arc;
+
+ use arrow::array::BooleanBuilder;
+ use arrow_array::StringArray;
+
+ use crate::regex::regexplike::regexp_like;
+
+ #[test]
+ fn test_case_sensitive_regexp_like() {
+ let values = StringArray::from(vec!["abc"; 5]);
+
+ let patterns =
+ StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)",
"^(b|c)"]);
+
+ let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
+ expected_builder.append_value(true);
+ expected_builder.append_value(false);
+ expected_builder.append_value(true);
+ expected_builder.append_value(false);
+ expected_builder.append_value(false);
+ let expected = expected_builder.finish();
+
+ let re = regexp_like::<i32>(&[Arc::new(values),
Arc::new(patterns)]).unwrap();
+
+ assert_eq!(re.as_ref(), &expected);
+ }
+
+ #[test]
+ fn test_case_insensitive_regexp_like() {
+ let values = StringArray::from(vec!["abc"; 5]);
+ let patterns =
+ StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)",
"^(b|c)"]);
+ let flags = StringArray::from(vec!["i"; 5]);
+
+ let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
+ expected_builder.append_value(true);
+ expected_builder.append_value(true);
+ expected_builder.append_value(true);
+ expected_builder.append_value(true);
+ expected_builder.append_value(false);
+ let expected = expected_builder.finish();
+
+ let re =
+ regexp_like::<i32>(&[Arc::new(values), Arc::new(patterns),
Arc::new(flags)])
+ .unwrap();
+
+ assert_eq!(re.as_ref(), &expected);
+ }
+
+ #[test]
+ fn test_unsupported_global_flag_regexp_like() {
+ let values = StringArray::from(vec!["abc"]);
+ let patterns = StringArray::from(vec!["^(a)"]);
+ let flags = StringArray::from(vec!["g"]);
+
+ let re_err =
+ regexp_like::<i32>(&[Arc::new(values), Arc::new(patterns),
Arc::new(flags)])
+ .expect_err("unsupported flag should have failed");
+
+ assert_eq!(
+ re_err.strip_backtrace(),
+ "Error during planning: regexp_like() does not support the \"global\" option"
+ );
+ }
+}
diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs
index 8a2180f00b..f34502af35 100644
--- a/datafusion/functions/src/regex/regexpmatch.rs
+++ b/datafusion/functions/src/regex/regexpmatch.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-//! Encoding expressions
+//! Regx expressions
use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
use arrow::compute::kernels::regexp;
use arrow::datatypes::DataType;
@@ -139,3 +139,72 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
),
}
}
+#[cfg(test)]
+mod tests {
+ use crate::regex::regexpmatch::regexp_match;
+ use arrow::array::{GenericStringBuilder, ListBuilder};
+ use arrow_array::StringArray;
+ use std::sync::Arc;
+
+ #[test]
+ fn test_case_sensitive_regexp_match() {
+ let values = StringArray::from(vec!["abc"; 5]);
+ let patterns =
+ StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)",
"^(b|c)"]);
+
+ let elem_builder: GenericStringBuilder<i32> =
GenericStringBuilder::new();
+ let mut expected_builder = ListBuilder::new(elem_builder);
+ expected_builder.values().append_value("a");
+ expected_builder.append(true);
+ expected_builder.append(false);
+ expected_builder.values().append_value("b");
+ expected_builder.append(true);
+ expected_builder.append(false);
+ expected_builder.append(false);
+ let expected = expected_builder.finish();
+
+ let re = regexp_match::<i32>(&[Arc::new(values),
Arc::new(patterns)]).unwrap();
+
+ assert_eq!(re.as_ref(), &expected);
+ }
+
+ #[test]
+ fn test_case_insensitive_regexp_match() {
+ let values = StringArray::from(vec!["abc"; 5]);
+ let patterns =
+ StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)",
"^(b|c)"]);
+ let flags = StringArray::from(vec!["i"; 5]);
+
+ let elem_builder: GenericStringBuilder<i32> =
GenericStringBuilder::new();
+ let mut expected_builder = ListBuilder::new(elem_builder);
+ expected_builder.values().append_value("a");
+ expected_builder.append(true);
+ expected_builder.values().append_value("a");
+ expected_builder.append(true);
+ expected_builder.values().append_value("b");
+ expected_builder.append(true);
+ expected_builder.values().append_value("b");
+ expected_builder.append(true);
+ expected_builder.append(false);
+ let expected = expected_builder.finish();
+
+ let re =
+ regexp_match::<i32>(&[Arc::new(values), Arc::new(patterns),
Arc::new(flags)])
+ .unwrap();
+
+ assert_eq!(re.as_ref(), &expected);
+ }
+
+ #[test]
+ fn test_unsupported_global_flag_regexp_match() {
+ let values = StringArray::from(vec!["abc"]);
+ let patterns = StringArray::from(vec!["^(a)"]);
+ let flags = StringArray::from(vec!["g"]);
+
+ let re_err =
+ regexp_match::<i32>(&[Arc::new(values), Arc::new(patterns),
Arc::new(flags)])
+ .expect_err("unsupported flag should have failed");
+
+ assert_eq!(re_err.strip_backtrace(), "Error during planning: regexp_match() does not support the \"global\" option");
+ }
+}
diff --git a/datafusion/physical-expr/benches/regexp.rs b/datafusion/physical-expr/benches/regexp.rs
index 0371b6bf28..32acd6ca8f 100644
--- a/datafusion/physical-expr/benches/regexp.rs
+++ b/datafusion/physical-expr/benches/regexp.rs
@@ -23,15 +23,11 @@ use std::sync::Arc;
use arrow_array::builder::StringBuilder;
use arrow_array::{ArrayRef, StringArray};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_physical_expr::regex_expressions::{regexp_match,
regexp_replace};
use rand::distributions::Alphanumeric;
use rand::rngs::ThreadRng;
use rand::seq::SliceRandom;
use rand::Rng;
-
-use datafusion_physical_expr::regex_expressions::{
- regexp_like, regexp_match, regexp_replace,
-};
-
fn data(rng: &mut ThreadRng) -> StringArray {
let mut data: Vec<String> = vec![];
for _ in 0..1000 {
@@ -78,20 +74,6 @@ fn flags(rng: &mut ThreadRng) -> StringArray {
}
fn criterion_benchmark(c: &mut Criterion) {
- c.bench_function("regexp_like_1000", |b| {
- let mut rng = rand::thread_rng();
- let data = Arc::new(data(&mut rng)) as ArrayRef;
- let regex = Arc::new(regex(&mut rng)) as ArrayRef;
- let flags = Arc::new(flags(&mut rng)) as ArrayRef;
-
- b.iter(|| {
- black_box(
- regexp_like::<i32>(&[data.clone(), regex.clone(),
flags.clone()])
- .expect("regexp_like should work on valid values"),
- )
- })
- });
-
c.bench_function("regexp_match_1000", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs
index 14ab25e961..81013882ad 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -105,14 +105,6 @@ macro_rules! invoke_if_crypto_expressions_feature_flag {
};
}
-#[cfg(feature = "regex_expressions")]
-macro_rules! invoke_on_array_if_regex_expressions_feature_flag {
- ($FUNC:ident, $T:tt, $NAME:expr) => {{
- use crate::regex_expressions;
- regex_expressions::$FUNC::<$T>
- }};
-}
-
#[cfg(not(feature = "regex_expressions"))]
macro_rules! invoke_on_array_if_regex_expressions_feature_flag {
($FUNC:ident, $T:tt, $NAME:expr) => {
@@ -560,27 +552,6 @@ pub fn create_physical_fun(
_ => unreachable!(),
},
}),
- BuiltinScalarFunction::RegexpLike => Arc::new(|args| match
args[0].data_type() {
- DataType::Utf8 => {
- let func = invoke_on_array_if_regex_expressions_feature_flag!(
- regexp_like,
- i32,
- "regexp_like"
- );
- make_scalar_function_inner(func)(args)
- }
- DataType::LargeUtf8 => {
- let func = invoke_on_array_if_regex_expressions_feature_flag!(
- regexp_like,
- i64,
- "regexp_like"
- );
- make_scalar_function_inner(func)(args)
- }
- other => {
- exec_err!("Unsupported data type {other:?} for function
regexp_like")
- }
- }),
BuiltinScalarFunction::RegexpReplace => {
Arc::new(|args| match args[0].data_type() {
DataType::Utf8 => {
@@ -949,8 +920,8 @@ fn func_order_in_one_dimension(
#[cfg(test)]
mod tests {
use super::*;
+ use crate::expressions::lit;
use crate::expressions::try_cast;
- use crate::expressions::{col, lit};
use arrow::{
array::{
Array, ArrayRef, BinaryArray, BooleanArray, Float32Array,
Float64Array,
@@ -959,7 +930,7 @@ mod tests {
datatypes::Field,
record_batch::RecordBatch,
};
- use datafusion_common::cast::{as_boolean_array, as_uint64_array};
+ use datafusion_common::cast::as_uint64_array;
use datafusion_common::{exec_err, internal_err, plan_err};
use datafusion_common::{DataFusionError, Result, ScalarValue};
use datafusion_expr::type_coercion::functions::data_types;
@@ -3044,74 +3015,6 @@ mod tests {
Ok(())
}
- #[test]
- #[cfg(feature = "regex_expressions")]
- fn test_regexp_like() -> Result<()> {
- let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
- let execution_props = ExecutionProps::new();
-
- let col_value: ArrayRef = Arc::new(StringArray::from(vec!["aaa-555"]));
- let pattern = lit(r".*-(\d*)");
- let columns: Vec<ArrayRef> = vec![col_value];
- let expr = create_physical_expr_with_type_coercion(
- &BuiltinScalarFunction::RegexpLike,
- &[col("a", &schema)?, pattern],
- &schema,
- &execution_props,
- )?;
-
- // type is correct
- assert_eq!(expr.data_type(&schema)?, DataType::Boolean);
-
- // evaluate works
- let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?;
- let result = expr
- .evaluate(&batch)?
- .into_array(batch.num_rows())
- .expect("Failed to convert to array");
-
- let result = as_boolean_array(&result)?;
-
- // value is correct
- assert!(result.value(0));
-
- Ok(())
- }
-
- #[test]
- #[cfg(feature = "regex_expressions")]
- fn test_regexp_like_all_literals() -> Result<()> {
- let schema = Schema::new(vec![Field::new("a", DataType::Int32,
false)]);
- let execution_props = ExecutionProps::new();
-
- let col_value = lit("aaa-555");
- let pattern = lit(r".*-(\d*)");
- let columns: Vec<ArrayRef> = vec![Arc::new(Int32Array::from(vec![1]))];
- let expr = create_physical_expr_with_type_coercion(
- &BuiltinScalarFunction::RegexpLike,
- &[col_value, pattern],
- &schema,
- &execution_props,
- )?;
-
- // type is correct
- assert_eq!(expr.data_type(&schema)?, DataType::Boolean);
-
- // evaluate works
- let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?;
- let result = expr
- .evaluate(&batch)?
- .into_array(batch.num_rows())
- .expect("Failed to convert to array");
-
- let result = as_boolean_array(&result)?;
-
- // value is correct
- assert!(result.value(0));
-
- Ok(())
- }
-
// Helper function just for testing.
// Returns `expressions` coerced to types compatible with
// `signature`, if possible.
diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs
index 846e5801af..99e6597dad 100644
--- a/datafusion/physical-expr/src/regex_expressions.rs
+++ b/datafusion/physical-expr/src/regex_expressions.rs
@@ -53,78 +53,6 @@ macro_rules! fetch_string_arg {
}};
}
-/// Tests a string using a regular expression returning true if at
-/// least one match, false otherwise.
-///
-/// The full list of supported features and syntax can be found at
-/// <https://docs.rs/regex/latest/regex/#syntax>
-///
-/// Supported flags can be found at
-/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
-///
-/// # Examples
-///
-/// ```ignore
-/// # use datafusion::prelude::*;
-/// # use datafusion::error::Result;
-/// # #[tokio::main]
-/// # async fn main() -> Result<()> {
-/// let ctx = SessionContext::new();
-/// let df = ctx.read_csv("tests/data/regex.csv",
CsvReadOptions::new()).await?;
-///
-/// // use the regexp_like function to test col 'values',
-/// // against patterns in col 'patterns' without flags
-/// let df = df.with_column(
-/// "a",
-/// regexp_like(vec![col("values"), col("patterns")])
-/// )?;
-/// // use the regexp_like function to test col 'values',
-/// // against patterns in col 'patterns' with flags
-/// let df = df.with_column(
-/// "b",
-/// regexp_like(vec![col("values"), col("patterns"), col("flags")])
-/// )?;
-/// // literals can be used as well with dataframe calls
-/// let df = df.with_column(
-/// "c",
-/// regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
-/// )?;
-///
-/// df.show().await?;
-///
-/// # Ok(())
-/// # }
-/// ```
-pub fn regexp_like<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- match args.len() {
- 2 => {
- let values = as_generic_string_array::<T>(&args[0])?;
- let regex = as_generic_string_array::<T>(&args[1])?;
- let array = arrow_string::regexp::regexp_is_match_utf8(values,
regex, None)
- .map_err(|e| arrow_datafusion_err!(e))?;
-
- Ok(Arc::new(array) as ArrayRef)
- }
- 3 => {
- let values = as_generic_string_array::<T>(&args[0])?;
- let regex = as_generic_string_array::<T>(&args[1])?;
- let flags = as_generic_string_array::<T>(&args[2])?;
-
- if flags.iter().any(|s| s == Some("g")) {
- return plan_err!("regexp_like() does not support the
\"global\" option");
- }
-
- let array = arrow_string::regexp::regexp_is_match_utf8(values,
regex, Some(flags))
- .map_err(|e| arrow_datafusion_err!(e))?;
-
- Ok(Arc::new(array) as ArrayRef)
- }
- other => exec_err!(
- "regexp_like was called with {other} arguments. It requires at
least 2 and at most 3."
- ),
- }
-}
-
/// Extract a specific group from a string column, using a regular expression.
///
/// The full list of supported features and syntax can be found at
@@ -487,64 +415,6 @@ mod tests {
use super::*;
- #[test]
- fn test_case_sensitive_regexp_like() {
- let values = StringArray::from(vec!["abc"; 5]);
-
- let patterns =
- StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)",
"^(b|c)"]);
-
- let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
- expected_builder.append_value(true);
- expected_builder.append_value(false);
- expected_builder.append_value(true);
- expected_builder.append_value(false);
- expected_builder.append_value(false);
- let expected = expected_builder.finish();
-
- let re = regexp_like::<i32>(&[Arc::new(values),
Arc::new(patterns)]).unwrap();
-
- assert_eq!(re.as_ref(), &expected);
- }
-
- #[test]
- fn test_case_insensitive_regexp_like() {
- let values = StringArray::from(vec!["abc"; 5]);
- let patterns =
- StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)",
"^(b|c)"]);
- let flags = StringArray::from(vec!["i"; 5]);
-
- let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
- expected_builder.append_value(true);
- expected_builder.append_value(true);
- expected_builder.append_value(true);
- expected_builder.append_value(true);
- expected_builder.append_value(false);
- let expected = expected_builder.finish();
-
- let re =
- regexp_like::<i32>(&[Arc::new(values), Arc::new(patterns),
Arc::new(flags)])
- .unwrap();
-
- assert_eq!(re.as_ref(), &expected);
- }
-
- #[test]
- fn test_unsupported_global_flag_regexp_like() {
- let values = StringArray::from(vec!["abc"]);
- let patterns = StringArray::from(vec!["^(a)"]);
- let flags = StringArray::from(vec!["g"]);
-
- let re_err =
- regexp_like::<i32>(&[Arc::new(values), Arc::new(patterns),
Arc::new(flags)])
- .expect_err("unsupported flag should have failed");
-
- assert_eq!(
- re_err.strip_backtrace(),
- "Error during planning: regexp_like() does not support the
\"global\" option"
- );
- }
-
#[test]
fn test_case_sensitive_regexp_match() {
let values = StringArray::from(vec!["abc"; 5]);
diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto
index 526ee7704b..c47b9abadb 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -681,7 +681,7 @@ enum ScalarFunction {
/// 132 was InStr
MakeDate = 133;
ArrayReverse = 134;
- RegexpLike = 135;
+ /// 135 is RegexpLike
ToChar = 136;
/// 137 was ToDate
}
diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs
index 56f0880074..c9be1bb7f3 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -22438,7 +22438,6 @@ impl serde::Serialize for ScalarFunction {
Self::EndsWith => "EndsWith",
Self::MakeDate => "MakeDate",
Self::ArrayReverse => "ArrayReverse",
- Self::RegexpLike => "RegexpLike",
Self::ToChar => "ToChar",
};
serializer.serialize_str(variant)
@@ -22568,7 +22567,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"EndsWith",
"MakeDate",
"ArrayReverse",
- "RegexpLike",
"ToChar",
];
@@ -22727,7 +22725,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"EndsWith" => Ok(ScalarFunction::EndsWith),
"MakeDate" => Ok(ScalarFunction::MakeDate),
"ArrayReverse" => Ok(ScalarFunction::ArrayReverse),
- "RegexpLike" => Ok(ScalarFunction::RegexpLike),
"ToChar" => Ok(ScalarFunction::ToChar),
_ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
}
diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs
index 187085454a..4d19b79a3b 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2770,7 +2770,8 @@ pub enum ScalarFunction {
/// / 132 was InStr
MakeDate = 133,
ArrayReverse = 134,
- RegexpLike = 135,
+ /// / 135 is RegexpLike
+ ///
/// / 137 was ToDate
ToChar = 136,
}
@@ -2898,7 +2899,6 @@ impl ScalarFunction {
ScalarFunction::EndsWith => "EndsWith",
ScalarFunction::MakeDate => "MakeDate",
ScalarFunction::ArrayReverse => "ArrayReverse",
- ScalarFunction::RegexpLike => "RegexpLike",
ScalarFunction::ToChar => "ToChar",
}
}
@@ -3022,7 +3022,6 @@ impl ScalarFunction {
"EndsWith" => Some(Self::EndsWith),
"MakeDate" => Some(Self::MakeDate),
"ArrayReverse" => Some(Self::ArrayReverse),
- "RegexpLike" => Some(Self::RegexpLike),
"ToChar" => Some(Self::ToChar),
_ => None,
}
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs
index 327902c98b..aee53849c8 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -61,12 +61,12 @@ use datafusion_expr::{
left, levenshtein, ln, log, log10, log2,
logical_plan::{PlanType, StringifiedPlan},
lower, lpad, ltrim, md5, nanvl, now, octet_length, overlay, pi, power,
radians,
- random, regexp_like, regexp_replace, repeat, replace, reverse, right,
round, rpad,
- rtrim, sha224, sha256, sha384, sha512, signum, sin, sinh, split_part, sqrt,
- starts_with, string_to_array, strpos, struct_fun, substr, substr_index,
substring,
- tan, tanh, to_hex, translate, trim, trunc, upper, uuid, AggregateFunction,
Between,
- BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr,
- GetFieldAccess, GetIndexedField, GroupingSet,
+ random, regexp_replace, repeat, replace, reverse, right, round, rpad,
rtrim, sha224,
+ sha256, sha384, sha512, signum, sin, sinh, split_part, sqrt, starts_with,
+ string_to_array, strpos, struct_fun, substr, substr_index, substring, tan,
tanh,
+ to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, Between,
BinaryExpr,
+ BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr,
GetFieldAccess,
+ GetIndexedField, GroupingSet,
GroupingSet::GroupingSets,
JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame,
WindowFrameBound,
WindowFrameUnits,
@@ -530,7 +530,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction {
ScalarFunction::Left => Self::Left,
ScalarFunction::Lpad => Self::Lpad,
ScalarFunction::Random => Self::Random,
- ScalarFunction::RegexpLike => Self::RegexpLike,
ScalarFunction::RegexpReplace => Self::RegexpReplace,
ScalarFunction::Repeat => Self::Repeat,
ScalarFunction::Replace => Self::Replace,
@@ -1704,12 +1703,6 @@ pub fn parse_expr(
.map(|expr| parse_expr(expr, registry, codec))
.collect::<Result<Vec<_>, _>>()?,
)),
- ScalarFunction::RegexpLike => Ok(regexp_like(
- args.to_owned()
- .iter()
- .map(|expr| parse_expr(expr, registry, codec))
- .collect::<Result<Vec<_>, _>>()?,
- )),
ScalarFunction::RegexpReplace => Ok(regexp_replace(
args.to_owned()
.iter()
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs
index ad618790a5..a4e9fd423b 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1508,7 +1508,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction {
BuiltinScalarFunction::Lpad => Self::Lpad,
BuiltinScalarFunction::Random => Self::Random,
BuiltinScalarFunction::Uuid => Self::Uuid,
- BuiltinScalarFunction::RegexpLike => Self::RegexpLike,
BuiltinScalarFunction::RegexpReplace => Self::RegexpReplace,
BuiltinScalarFunction::Repeat => Self::Repeat,
BuiltinScalarFunction::Replace => Self::Replace,
diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt
index a80b08c41e..19966be209 100644
--- a/datafusion/sqllogictest/test_files/regexp.slt
+++ b/datafusion/sqllogictest/test_files/regexp.slt
@@ -124,6 +124,10 @@ SELECT regexp_like('(?<=[A-Z]\w )Smith', 'John Smith', 'i');
----
false
+query B
+select regexp_like('aaa-555', '.*-(\d*)');
+----
+true
#
# regexp_match tests