This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 5188a5d4f3 port regexp_like function and port related tests (#9397)
5188a5d4f3 is described below

commit 5188a5d4f3ce78b891afaf87426b36c6c7ee38d7
Author: Lordworms <[email protected]>
AuthorDate: Mon Mar 4 05:06:35 2024 -0600

    port regexp_like function and port related tests (#9397)
    
    * port regexp_like function and port related tests
    
    delete useless test
    
    fix chores
    
    change regexp_like
    
    change lock
    
    change format
    
    fix typo
    
    change prost
    
    remove unused
    
    adding usage
    
    adding dependency
    
    adding dependency
    
    dep
    
    * adding tests
    
    * remove useless
    
    * remove unused
    
    * remove unused
    
    * change dependency structure
    
    * format toml
---
 .../core/tests/dataframe/dataframe_functions.rs    |   2 +-
 datafusion/expr/src/built_in_function.rs           |  22 --
 datafusion/expr/src/expr_fn.rs                     |   7 -
 datafusion/functions/Cargo.toml                    |   4 +
 .../regexp.rs => functions/benches/regx.rs}        |  30 +--
 datafusion/functions/src/regex/mod.rs              |  13 +-
 datafusion/functions/src/regex/regexplike.rs       | 252 +++++++++++++++++++++
 datafusion/functions/src/regex/regexpmatch.rs      |  71 +++++-
 datafusion/physical-expr/benches/regexp.rs         |  20 +-
 datafusion/physical-expr/src/functions.rs          | 101 +--------
 datafusion/physical-expr/src/regex_expressions.rs  | 130 -----------
 datafusion/proto/proto/datafusion.proto            |   2 +-
 datafusion/proto/src/generated/pbjson.rs           |   3 -
 datafusion/proto/src/generated/prost.rs            |   5 +-
 datafusion/proto/src/logical_plan/from_proto.rs    |  19 +-
 datafusion/proto/src/logical_plan/to_proto.rs      |   1 -
 datafusion/sqllogictest/test_files/regexp.slt      |   4 +
 17 files changed, 354 insertions(+), 332 deletions(-)

diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs
index 8bb23e96e0..c857202c23 100644
--- a/datafusion/core/tests/dataframe/dataframe_functions.rs
+++ b/datafusion/core/tests/dataframe/dataframe_functions.rs
@@ -426,7 +426,7 @@ async fn test_fn_md5() -> Result<()> {
 #[tokio::test]
 #[cfg(feature = "unicode_expressions")]
 async fn test_fn_regexp_like() -> Result<()> {
-    let expr = regexp_like(vec![col("a"), lit("[a-z]")]);
+    let expr = regexp_like(col("a"), lit("[a-z]"));
 
     let expected = [
         "+-----------------------------------+",
diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs
index c04d867156..6b3f2b956d 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -218,8 +218,6 @@ pub enum BuiltinScalarFunction {
     OctetLength,
     /// random
     Random,
-    /// regexp_like
-    RegexpLike,
     /// regexp_match
     /// regexp_replace
     RegexpReplace,
@@ -419,7 +417,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::MD5 => Volatility::Immutable,
             BuiltinScalarFunction::OctetLength => Volatility::Immutable,
             BuiltinScalarFunction::Radians => Volatility::Immutable,
-            BuiltinScalarFunction::RegexpLike => Volatility::Immutable,
             BuiltinScalarFunction::RegexpReplace => Volatility::Immutable,
             BuiltinScalarFunction::Repeat => Volatility::Immutable,
             BuiltinScalarFunction::Replace => Volatility::Immutable,
@@ -754,15 +751,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::Upper => {
                 utf8_to_str_type(&input_expr_types[0], "upper")
             }
-            BuiltinScalarFunction::RegexpLike => Ok(match &input_expr_types[0] {
-                LargeUtf8 | Utf8 => Boolean,
-                Null => Null,
-                other => {
-                    return plan_err!(
-                        "The regexp_like function can only accept strings. Got {other}"
-                    );
-                }
-            }),
 
             BuiltinScalarFunction::Factorial
             | BuiltinScalarFunction::Gcd
@@ -1173,15 +1161,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::Replace | BuiltinScalarFunction::Translate => {
                 Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility())
             }
-            BuiltinScalarFunction::RegexpLike => Signature::one_of(
-                vec![
-                    Exact(vec![Utf8, Utf8]),
-                    Exact(vec![LargeUtf8, Utf8]),
-                    Exact(vec![Utf8, Utf8, Utf8]),
-                    Exact(vec![LargeUtf8, Utf8, Utf8]),
-                ],
-                self.volatility(),
-            ),
             BuiltinScalarFunction::RegexpReplace => Signature::one_of(
                 vec![
                     Exact(vec![Utf8, Utf8, Utf8]),
@@ -1420,7 +1399,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::FindInSet => &["find_in_set"],
 
             // regex functions
-            BuiltinScalarFunction::RegexpLike => &["regexp_like"],
             BuiltinScalarFunction::RegexpReplace => &["regexp_replace"],
 
             // time/date functions
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index ba8b76ac6f..ec53fd4ef1 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -822,11 +822,6 @@ nary_scalar_expr!(
     rpad,
     "fill up a string to the length by appending the characters"
 );
-nary_scalar_expr!(
-    RegexpLike,
-    regexp_like,
-    "matches a regular expression against a string and returns true or false if there was at least one match or not"
-);
 nary_scalar_expr!(
     RegexpReplace,
     regexp_replace,
@@ -1319,8 +1314,6 @@ mod test {
         test_scalar_expr!(Ltrim, ltrim, string);
         test_scalar_expr!(MD5, md5, string);
         test_scalar_expr!(OctetLength, octet_length, string);
-        test_nary_scalar_expr!(RegexpLike, regexp_like, string, pattern);
-        test_nary_scalar_expr!(RegexpLike, regexp_like, string, pattern, flags);
         test_nary_scalar_expr!(
             RegexpReplace,
             regexp_replace,
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index e890c9623c..502c692301 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -68,3 +68,7 @@ tokio = { workspace = true, features = ["macros", "rt", "sync"] }
 [[bench]]
 harness = false
 name = "to_timestamp"
+
+[[bench]]
+harness = false
+name = "regx"
diff --git a/datafusion/physical-expr/benches/regexp.rs b/datafusion/functions/benches/regx.rs
similarity index 79%
copy from datafusion/physical-expr/benches/regexp.rs
copy to datafusion/functions/benches/regx.rs
index 0371b6bf28..390676f8f2 100644
--- a/datafusion/physical-expr/benches/regexp.rs
+++ b/datafusion/functions/benches/regx.rs
@@ -17,21 +17,17 @@
 
 extern crate criterion;
 
-use std::iter;
 use std::sync::Arc;
 
 use arrow_array::builder::StringBuilder;
 use arrow_array::{ArrayRef, StringArray};
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_functions::regex::regexplike::regexp_like;
+use datafusion_functions::regex::regexpmatch::regexp_match;
 use rand::distributions::Alphanumeric;
 use rand::rngs::ThreadRng;
 use rand::seq::SliceRandom;
 use rand::Rng;
-
-use datafusion_physical_expr::regex_expressions::{
-    regexp_like, regexp_match, regexp_replace,
-};
-
 fn data(rng: &mut ThreadRng) -> StringArray {
     let mut data: Vec<String> = vec![];
     for _ in 0..1000 {
@@ -105,28 +101,6 @@ fn criterion_benchmark(c: &mut Criterion) {
             )
         })
     });
-
-    c.bench_function("regexp_replace_1000", |b| {
-        let mut rng = rand::thread_rng();
-        let data = Arc::new(data(&mut rng)) as ArrayRef;
-        let regex = Arc::new(regex(&mut rng)) as ArrayRef;
-        let flags = Arc::new(flags(&mut rng)) as ArrayRef;
-        let replacement =
-            Arc::new(StringArray::from_iter_values(iter::repeat("XX").take(1000)))
-                as ArrayRef;
-
-        b.iter(|| {
-            black_box(
-                regexp_replace::<i32>(&[
-                    data.clone(),
-                    regex.clone(),
-                    replacement.clone(),
-                    flags.clone(),
-                ])
-                .expect("regexp_replace should work on valid values"),
-            )
-        })
-    });
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/src/regex/mod.rs b/datafusion/functions/src/regex/mod.rs
index 862e8b77a2..1e0c7799c6 100644
--- a/datafusion/functions/src/regex/mod.rs
+++ b/datafusion/functions/src/regex/mod.rs
@@ -17,13 +17,18 @@
 
 //! "regx" DataFusion functions
 
-mod regexpmatch;
+pub mod regexplike;
+pub mod regexpmatch;
+
 // create UDFs
 make_udf_function!(regexpmatch::RegexpMatchFunc, REGEXP_MATCH, regexp_match);
-
+make_udf_function!(regexplike::RegexpLikeFunc, REGEXP_LIKE, regexp_like);
 export_functions!((
     regexp_match,
-    input_arg1
-    input_arg2,
+    input_arg1 input_arg2,
     "returns a list of regular expression matches in a string. "
+),(
+    regexp_like,
+    input_arg1 input_arg2,
+    "Returns true if a has at least one match in a string,false otherwise."
 ));
diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs
new file mode 100644
index 0000000000..b0abad3180
--- /dev/null
+++ b/datafusion/functions/src/regex/regexplike.rs
@@ -0,0 +1,252 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Regx expressions
+use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
+use arrow::compute::kernels::regexp;
+use arrow::datatypes::DataType;
+use datafusion_common::exec_err;
+use datafusion_common::ScalarValue;
+use datafusion_common::{arrow_datafusion_err, plan_err};
+use datafusion_common::{
+    cast::as_generic_string_array, internal_err, DataFusionError, Result,
+};
+use datafusion_expr::ColumnarValue;
+use datafusion_expr::TypeSignature::*;
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use std::any::Any;
+use std::sync::Arc;
+
+#[derive(Debug)]
+pub(super) struct RegexpLikeFunc {
+    signature: Signature,
+}
+impl RegexpLikeFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    Exact(vec![Utf8, Utf8]),
+                    Exact(vec![LargeUtf8, Utf8]),
+                    Exact(vec![Utf8, Utf8, Utf8]),
+                    Exact(vec![LargeUtf8, Utf8, Utf8]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpLikeFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_like"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+
+        Ok(match &arg_types[0] {
+            LargeUtf8 | Utf8 => Boolean,
+            Null => Null,
+            other => {
+                return plan_err!(
+                    "The regexp_like function can only accept strings. Got {other}"
+                );
+            }
+        })
+    }
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let len = args
+            .iter()
+            .fold(Option::<usize>::None, |acc, arg| match arg {
+                ColumnarValue::Scalar(_) => acc,
+                ColumnarValue::Array(a) => Some(a.len()),
+            });
+
+        let is_scalar = len.is_none();
+        let inferred_length = len.unwrap_or(1);
+        let args = args
+            .iter()
+            .map(|arg| arg.clone().into_array(inferred_length))
+            .collect::<Result<Vec<_>>>()?;
+
+        let result = regexp_like_func(&args);
+        if is_scalar {
+            // If all inputs are scalar, keeps output as scalar
+            let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
+            result.map(ColumnarValue::Scalar)
+        } else {
+            result.map(ColumnarValue::Array)
+        }
+    }
+}
+fn regexp_like_func(args: &[ArrayRef]) -> Result<ArrayRef> {
+    match args[0].data_type() {
+        DataType::Utf8 => regexp_like::<i32>(args),
+        DataType::LargeUtf8 => regexp_like::<i64>(args),
+        other => {
+            internal_err!("Unsupported data type {other:?} for function regexp_like")
+        }
+    }
+}
+/// Tests a string using a regular expression returning true if at
+/// least one match, false otherwise.
+///
+/// The full list of supported features and syntax can be found at
+/// <https://docs.rs/regex/latest/regex/#syntax>
+///
+/// Supported flags can be found at
+/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
+///
+/// # Examples
+///
+/// ```ignore
+/// # use datafusion::prelude::*;
+/// # use datafusion::error::Result;
+/// # #[tokio::main]
+/// # async fn main() -> Result<()> {
+/// let ctx = SessionContext::new();
+/// let df = ctx.read_csv("tests/data/regex.csv", CsvReadOptions::new()).await?;
+///
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' without flags
+/// let df = df.with_column(
+///     "a",
+///     regexp_like(vec![col("values"), col("patterns")])
+/// )?;
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' with flags
+/// let df = df.with_column(
+///     "b",
+///     regexp_like(vec![col("values"), col("patterns"), col("flags")])
+/// )?;
+/// // literals can be used as well with dataframe calls
+/// let df = df.with_column(
+///     "c",
+///     regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
+/// )?;
+///
+/// df.show().await?;
+///
+/// # Ok(())
+/// # }
+/// ```
+pub fn regexp_like<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    match args.len() {
+        2 => {
+            let values = as_generic_string_array::<T>(&args[0])?;
+            let regex = as_generic_string_array::<T>(&args[1])?;
+            let array = regexp::regexp_is_match_utf8(values, regex, None)
+                .map_err(|e| arrow_datafusion_err!(e))?;
+
+            Ok(Arc::new(array) as ArrayRef)
+        }
+        3 => {
+            let values = as_generic_string_array::<T>(&args[0])?;
+            let regex = as_generic_string_array::<T>(&args[1])?;
+            let flags = as_generic_string_array::<T>(&args[2])?;
+
+            if flags.iter().any(|s| s == Some("g")) {
+                return plan_err!("regexp_like() does not support the \"global\" option");
+            }
+
+            let array = regexp::regexp_is_match_utf8(values, regex, Some(flags))
+                .map_err(|e| arrow_datafusion_err!(e))?;
+
+            Ok(Arc::new(array) as ArrayRef)
+        }
+        other => exec_err!(
+            "regexp_like was called with {other} arguments. It requires at least 2 and at most 3."
+        ),
+    }
+}
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::BooleanBuilder;
+    use arrow_array::StringArray;
+
+    use crate::regex::regexplike::regexp_like;
+
+    #[test]
+    fn test_case_sensitive_regexp_like() {
+        let values = StringArray::from(vec!["abc"; 5]);
+
+        let patterns =
+            StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]);
+
+        let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
+        expected_builder.append_value(true);
+        expected_builder.append_value(false);
+        expected_builder.append_value(true);
+        expected_builder.append_value(false);
+        expected_builder.append_value(false);
+        let expected = expected_builder.finish();
+
+        let re = regexp_like::<i32>(&[Arc::new(values), Arc::new(patterns)]).unwrap();
+
+        assert_eq!(re.as_ref(), &expected);
+    }
+
+    #[test]
+    fn test_case_insensitive_regexp_like() {
+        let values = StringArray::from(vec!["abc"; 5]);
+        let patterns =
+            StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]);
+        let flags = StringArray::from(vec!["i"; 5]);
+
+        let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
+        expected_builder.append_value(true);
+        expected_builder.append_value(true);
+        expected_builder.append_value(true);
+        expected_builder.append_value(true);
+        expected_builder.append_value(false);
+        let expected = expected_builder.finish();
+
+        let re =
+            regexp_like::<i32>(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
+                .unwrap();
+
+        assert_eq!(re.as_ref(), &expected);
+    }
+
+    #[test]
+    fn test_unsupported_global_flag_regexp_like() {
+        let values = StringArray::from(vec!["abc"]);
+        let patterns = StringArray::from(vec!["^(a)"]);
+        let flags = StringArray::from(vec!["g"]);
+
+        let re_err =
+            regexp_like::<i32>(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
+                .expect_err("unsupported flag should have failed");
+
+        assert_eq!(
+            re_err.strip_backtrace(),
+            "Error during planning: regexp_like() does not support the \"global\" option"
+        );
+    }
+}
diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs
index 8a2180f00b..f34502af35 100644
--- a/datafusion/functions/src/regex/regexpmatch.rs
+++ b/datafusion/functions/src/regex/regexpmatch.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Encoding expressions
+//! Regx expressions
 use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
 use arrow::compute::kernels::regexp;
 use arrow::datatypes::DataType;
@@ -139,3 +139,72 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
         ),
     }
 }
+#[cfg(test)]
+mod tests {
+    use crate::regex::regexpmatch::regexp_match;
+    use arrow::array::{GenericStringBuilder, ListBuilder};
+    use arrow_array::StringArray;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_case_sensitive_regexp_match() {
+        let values = StringArray::from(vec!["abc"; 5]);
+        let patterns =
+            StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]);
+
+        let elem_builder: GenericStringBuilder<i32> = GenericStringBuilder::new();
+        let mut expected_builder = ListBuilder::new(elem_builder);
+        expected_builder.values().append_value("a");
+        expected_builder.append(true);
+        expected_builder.append(false);
+        expected_builder.values().append_value("b");
+        expected_builder.append(true);
+        expected_builder.append(false);
+        expected_builder.append(false);
+        let expected = expected_builder.finish();
+
+        let re = regexp_match::<i32>(&[Arc::new(values), Arc::new(patterns)]).unwrap();
+
+        assert_eq!(re.as_ref(), &expected);
+    }
+
+    #[test]
+    fn test_case_insensitive_regexp_match() {
+        let values = StringArray::from(vec!["abc"; 5]);
+        let patterns =
+            StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]);
+        let flags = StringArray::from(vec!["i"; 5]);
+
+        let elem_builder: GenericStringBuilder<i32> = GenericStringBuilder::new();
+        let mut expected_builder = ListBuilder::new(elem_builder);
+        expected_builder.values().append_value("a");
+        expected_builder.append(true);
+        expected_builder.values().append_value("a");
+        expected_builder.append(true);
+        expected_builder.values().append_value("b");
+        expected_builder.append(true);
+        expected_builder.values().append_value("b");
+        expected_builder.append(true);
+        expected_builder.append(false);
+        let expected = expected_builder.finish();
+
+        let re =
+            regexp_match::<i32>(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
+                .unwrap();
+
+        assert_eq!(re.as_ref(), &expected);
+    }
+
+    #[test]
+    fn test_unsupported_global_flag_regexp_match() {
+        let values = StringArray::from(vec!["abc"]);
+        let patterns = StringArray::from(vec!["^(a)"]);
+        let flags = StringArray::from(vec!["g"]);
+
+        let re_err =
+            regexp_match::<i32>(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
+                .expect_err("unsupported flag should have failed");
+
+        assert_eq!(re_err.strip_backtrace(), "Error during planning: regexp_match() does not support the \"global\" option");
+    }
+}
diff --git a/datafusion/physical-expr/benches/regexp.rs b/datafusion/physical-expr/benches/regexp.rs
index 0371b6bf28..32acd6ca8f 100644
--- a/datafusion/physical-expr/benches/regexp.rs
+++ b/datafusion/physical-expr/benches/regexp.rs
@@ -23,15 +23,11 @@ use std::sync::Arc;
 use arrow_array::builder::StringBuilder;
 use arrow_array::{ArrayRef, StringArray};
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_physical_expr::regex_expressions::{regexp_match, regexp_replace};
 use rand::distributions::Alphanumeric;
 use rand::rngs::ThreadRng;
 use rand::seq::SliceRandom;
 use rand::Rng;
-
-use datafusion_physical_expr::regex_expressions::{
-    regexp_like, regexp_match, regexp_replace,
-};
-
 fn data(rng: &mut ThreadRng) -> StringArray {
     let mut data: Vec<String> = vec![];
     for _ in 0..1000 {
@@ -78,20 +74,6 @@ fn flags(rng: &mut ThreadRng) -> StringArray {
 }
 
 fn criterion_benchmark(c: &mut Criterion) {
-    c.bench_function("regexp_like_1000", |b| {
-        let mut rng = rand::thread_rng();
-        let data = Arc::new(data(&mut rng)) as ArrayRef;
-        let regex = Arc::new(regex(&mut rng)) as ArrayRef;
-        let flags = Arc::new(flags(&mut rng)) as ArrayRef;
-
-        b.iter(|| {
-            black_box(
-                regexp_like::<i32>(&[data.clone(), regex.clone(), flags.clone()])
-                    .expect("regexp_like should work on valid values"),
-            )
-        })
-    });
-
     c.bench_function("regexp_match_1000", |b| {
         let mut rng = rand::thread_rng();
         let data = Arc::new(data(&mut rng)) as ArrayRef;
diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs
index 14ab25e961..81013882ad 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -105,14 +105,6 @@ macro_rules! invoke_if_crypto_expressions_feature_flag {
     };
 }
 
-#[cfg(feature = "regex_expressions")]
-macro_rules! invoke_on_array_if_regex_expressions_feature_flag {
-    ($FUNC:ident, $T:tt, $NAME:expr) => {{
-        use crate::regex_expressions;
-        regex_expressions::$FUNC::<$T>
-    }};
-}
-
 #[cfg(not(feature = "regex_expressions"))]
 macro_rules! invoke_on_array_if_regex_expressions_feature_flag {
     ($FUNC:ident, $T:tt, $NAME:expr) => {
@@ -560,27 +552,6 @@ pub fn create_physical_fun(
                 _ => unreachable!(),
             },
         }),
-        BuiltinScalarFunction::RegexpLike => Arc::new(|args| match args[0].data_type() {
-            DataType::Utf8 => {
-                let func = invoke_on_array_if_regex_expressions_feature_flag!(
-                    regexp_like,
-                    i32,
-                    "regexp_like"
-                );
-                make_scalar_function_inner(func)(args)
-            }
-            DataType::LargeUtf8 => {
-                let func = invoke_on_array_if_regex_expressions_feature_flag!(
-                    regexp_like,
-                    i64,
-                    "regexp_like"
-                );
-                make_scalar_function_inner(func)(args)
-            }
-            other => {
-                exec_err!("Unsupported data type {other:?} for function regexp_like")
-            }
-        }),
         BuiltinScalarFunction::RegexpReplace => {
             Arc::new(|args| match args[0].data_type() {
                 DataType::Utf8 => {
@@ -949,8 +920,8 @@ fn func_order_in_one_dimension(
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::expressions::lit;
     use crate::expressions::try_cast;
-    use crate::expressions::{col, lit};
     use arrow::{
         array::{
             Array, ArrayRef, BinaryArray, BooleanArray, Float32Array, Float64Array,
@@ -959,7 +930,7 @@ mod tests {
         datatypes::Field,
         record_batch::RecordBatch,
     };
-    use datafusion_common::cast::{as_boolean_array, as_uint64_array};
+    use datafusion_common::cast::as_uint64_array;
     use datafusion_common::{exec_err, internal_err, plan_err};
     use datafusion_common::{DataFusionError, Result, ScalarValue};
     use datafusion_expr::type_coercion::functions::data_types;
@@ -3044,74 +3015,6 @@ mod tests {
         Ok(())
     }
 
-    #[test]
-    #[cfg(feature = "regex_expressions")]
-    fn test_regexp_like() -> Result<()> {
-        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
-        let execution_props = ExecutionProps::new();
-
-        let col_value: ArrayRef = Arc::new(StringArray::from(vec!["aaa-555"]));
-        let pattern = lit(r".*-(\d*)");
-        let columns: Vec<ArrayRef> = vec![col_value];
-        let expr = create_physical_expr_with_type_coercion(
-            &BuiltinScalarFunction::RegexpLike,
-            &[col("a", &schema)?, pattern],
-            &schema,
-            &execution_props,
-        )?;
-
-        // type is correct
-        assert_eq!(expr.data_type(&schema)?, DataType::Boolean);
-
-        // evaluate works
-        let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?;
-        let result = expr
-            .evaluate(&batch)?
-            .into_array(batch.num_rows())
-            .expect("Failed to convert to array");
-
-        let result = as_boolean_array(&result)?;
-
-        // value is correct
-        assert!(result.value(0));
-
-        Ok(())
-    }
-
-    #[test]
-    #[cfg(feature = "regex_expressions")]
-    fn test_regexp_like_all_literals() -> Result<()> {
-        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
-        let execution_props = ExecutionProps::new();
-
-        let col_value = lit("aaa-555");
-        let pattern = lit(r".*-(\d*)");
-        let columns: Vec<ArrayRef> = vec![Arc::new(Int32Array::from(vec![1]))];
-        let expr = create_physical_expr_with_type_coercion(
-            &BuiltinScalarFunction::RegexpLike,
-            &[col_value, pattern],
-            &schema,
-            &execution_props,
-        )?;
-
-        // type is correct
-        assert_eq!(expr.data_type(&schema)?, DataType::Boolean);
-
-        // evaluate works
-        let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?;
-        let result = expr
-            .evaluate(&batch)?
-            .into_array(batch.num_rows())
-            .expect("Failed to convert to array");
-
-        let result = as_boolean_array(&result)?;
-
-        // value is correct
-        assert!(result.value(0));
-
-        Ok(())
-    }
-
     // Helper function just for testing.
     // Returns `expressions` coerced to types compatible with
     // `signature`, if possible.
diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs
index 846e5801af..99e6597dad 100644
--- a/datafusion/physical-expr/src/regex_expressions.rs
+++ b/datafusion/physical-expr/src/regex_expressions.rs
@@ -53,78 +53,6 @@ macro_rules! fetch_string_arg {
     }};
 }
 
-/// Tests a string using a regular expression returning true if at
-/// least one match, false otherwise.
-///
-/// The full list of supported features and syntax can be found at
-/// <https://docs.rs/regex/latest/regex/#syntax>
-///
-/// Supported flags can be found at
-/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
-///
-/// # Examples
-///
-/// ```ignore
-/// # use datafusion::prelude::*;
-/// # use datafusion::error::Result;
-/// # #[tokio::main]
-/// # async fn main() -> Result<()> {
-/// let ctx = SessionContext::new();
-/// let df = ctx.read_csv("tests/data/regex.csv", CsvReadOptions::new()).await?;
-///
-/// // use the regexp_like function to test col 'values',
-/// // against patterns in col 'patterns' without flags
-/// let df = df.with_column(
-///     "a",
-///     regexp_like(vec![col("values"), col("patterns")])
-/// )?;
-/// // use the regexp_like function to test col 'values',
-/// // against patterns in col 'patterns' with flags
-/// let df = df.with_column(
-///     "b",
-///     regexp_like(vec![col("values"), col("patterns"), col("flags")])
-/// )?;
-/// // literals can be used as well with dataframe calls
-/// let df = df.with_column(
-///     "c",
-///     regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
-/// )?;
-///
-/// df.show().await?;
-///
-/// # Ok(())
-/// # }
-/// ```
-pub fn regexp_like<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args.len() {
-        2 => {
-            let values = as_generic_string_array::<T>(&args[0])?;
-            let regex = as_generic_string_array::<T>(&args[1])?;
-            let array = arrow_string::regexp::regexp_is_match_utf8(values, regex, None)
-                .map_err(|e| arrow_datafusion_err!(e))?;
-
-            Ok(Arc::new(array) as ArrayRef)
-        }
-        3 => {
-            let values = as_generic_string_array::<T>(&args[0])?;
-            let regex = as_generic_string_array::<T>(&args[1])?;
-            let flags = as_generic_string_array::<T>(&args[2])?;
-
-            if flags.iter().any(|s| s == Some("g")) {
-                return plan_err!("regexp_like() does not support the \"global\" option");
-            }
-
-            let array = arrow_string::regexp::regexp_is_match_utf8(values, regex, Some(flags))
-                .map_err(|e| arrow_datafusion_err!(e))?;
-
-            Ok(Arc::new(array) as ArrayRef)
-        }
-        other => exec_err!(
-            "regexp_like was called with {other} arguments. It requires at least 2 and at most 3."
-        ),
-    }
-}
-
 /// Extract a specific group from a string column, using a regular expression.
 ///
 /// The full list of supported features and syntax can be found at
@@ -487,64 +415,6 @@ mod tests {
 
     use super::*;
 
-    #[test]
-    fn test_case_sensitive_regexp_like() {
-        let values = StringArray::from(vec!["abc"; 5]);
-
-        let patterns =
-            StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]);
-
-        let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
-        expected_builder.append_value(true);
-        expected_builder.append_value(false);
-        expected_builder.append_value(true);
-        expected_builder.append_value(false);
-        expected_builder.append_value(false);
-        let expected = expected_builder.finish();
-
-        let re = regexp_like::<i32>(&[Arc::new(values), Arc::new(patterns)]).unwrap();
-
-        assert_eq!(re.as_ref(), &expected);
-    }
-
-    #[test]
-    fn test_case_insensitive_regexp_like() {
-        let values = StringArray::from(vec!["abc"; 5]);
-        let patterns =
-            StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]);
-        let flags = StringArray::from(vec!["i"; 5]);
-
-        let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
-        expected_builder.append_value(true);
-        expected_builder.append_value(true);
-        expected_builder.append_value(true);
-        expected_builder.append_value(true);
-        expected_builder.append_value(false);
-        let expected = expected_builder.finish();
-
-        let re =
-            regexp_like::<i32>(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
-                .unwrap();
-
-        assert_eq!(re.as_ref(), &expected);
-    }
-
-    #[test]
-    fn test_unsupported_global_flag_regexp_like() {
-        let values = StringArray::from(vec!["abc"]);
-        let patterns = StringArray::from(vec!["^(a)"]);
-        let flags = StringArray::from(vec!["g"]);
-
-        let re_err =
-            regexp_like::<i32>(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
-                .expect_err("unsupported flag should have failed");
-
-        assert_eq!(
-            re_err.strip_backtrace(),
-            "Error during planning: regexp_like() does not support the \"global\" option"
-        );
-    }
-
     #[test]
     fn test_case_sensitive_regexp_match() {
         let values = StringArray::from(vec!["abc"; 5]);
diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto
index 526ee7704b..c47b9abadb 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -681,7 +681,7 @@ enum ScalarFunction {
   /// 132 was InStr
   MakeDate = 133;
   ArrayReverse = 134;
-  RegexpLike = 135;
+  /// 135 is RegexpLike
   ToChar = 136;
   /// 137 was ToDate
 }
diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs
index 56f0880074..c9be1bb7f3 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -22438,7 +22438,6 @@ impl serde::Serialize for ScalarFunction {
             Self::EndsWith => "EndsWith",
             Self::MakeDate => "MakeDate",
             Self::ArrayReverse => "ArrayReverse",
-            Self::RegexpLike => "RegexpLike",
             Self::ToChar => "ToChar",
         };
         serializer.serialize_str(variant)
@@ -22568,7 +22567,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
             "EndsWith",
             "MakeDate",
             "ArrayReverse",
-            "RegexpLike",
             "ToChar",
         ];
 
@@ -22727,7 +22725,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
                     "EndsWith" => Ok(ScalarFunction::EndsWith),
                     "MakeDate" => Ok(ScalarFunction::MakeDate),
                     "ArrayReverse" => Ok(ScalarFunction::ArrayReverse),
-                    "RegexpLike" => Ok(ScalarFunction::RegexpLike),
                     "ToChar" => Ok(ScalarFunction::ToChar),
                     _ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
                 }
diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs
index 187085454a..4d19b79a3b 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2770,7 +2770,8 @@ pub enum ScalarFunction {
     /// / 132 was InStr
     MakeDate = 133,
     ArrayReverse = 134,
-    RegexpLike = 135,
+    /// / 135 is RegexpLike
+    ///
     /// / 137 was ToDate
     ToChar = 136,
 }
@@ -2898,7 +2899,6 @@ impl ScalarFunction {
             ScalarFunction::EndsWith => "EndsWith",
             ScalarFunction::MakeDate => "MakeDate",
             ScalarFunction::ArrayReverse => "ArrayReverse",
-            ScalarFunction::RegexpLike => "RegexpLike",
             ScalarFunction::ToChar => "ToChar",
         }
     }
@@ -3022,7 +3022,6 @@ impl ScalarFunction {
             "EndsWith" => Some(Self::EndsWith),
             "MakeDate" => Some(Self::MakeDate),
             "ArrayReverse" => Some(Self::ArrayReverse),
-            "RegexpLike" => Some(Self::RegexpLike),
             "ToChar" => Some(Self::ToChar),
             _ => None,
         }
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs
index 327902c98b..aee53849c8 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -61,12 +61,12 @@ use datafusion_expr::{
     left, levenshtein, ln, log, log10, log2,
     logical_plan::{PlanType, StringifiedPlan},
     lower, lpad, ltrim, md5, nanvl, now, octet_length, overlay, pi, power, radians,
-    random, regexp_like, regexp_replace, repeat, replace, reverse, right, round, rpad,
-    rtrim, sha224, sha256, sha384, sha512, signum, sin, sinh, split_part, sqrt,
-    starts_with, string_to_array, strpos, struct_fun, substr, substr_index, substring,
-    tan, tanh, to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, Between,
-    BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr,
-    GetFieldAccess, GetIndexedField, GroupingSet,
+    random, regexp_replace, repeat, replace, reverse, right, round, rpad, rtrim, sha224,
+    sha256, sha384, sha512, signum, sin, sinh, split_part, sqrt, starts_with,
+    string_to_array, strpos, struct_fun, substr, substr_index, substring, tan, tanh,
+    to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, Between, BinaryExpr,
+    BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess,
+    GetIndexedField, GroupingSet,
     GroupingSet::GroupingSets,
     JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound,
     WindowFrameUnits,
@@ -530,7 +530,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction {
             ScalarFunction::Left => Self::Left,
             ScalarFunction::Lpad => Self::Lpad,
             ScalarFunction::Random => Self::Random,
-            ScalarFunction::RegexpLike => Self::RegexpLike,
             ScalarFunction::RegexpReplace => Self::RegexpReplace,
             ScalarFunction::Repeat => Self::Repeat,
             ScalarFunction::Replace => Self::Replace,
@@ -1704,12 +1703,6 @@ pub fn parse_expr(
                         .map(|expr| parse_expr(expr, registry, codec))
                         .collect::<Result<Vec<_>, _>>()?,
                 )),
-                ScalarFunction::RegexpLike => Ok(regexp_like(
-                    args.to_owned()
-                        .iter()
-                        .map(|expr| parse_expr(expr, registry, codec))
-                        .collect::<Result<Vec<_>, _>>()?,
-                )),
                 ScalarFunction::RegexpReplace => Ok(regexp_replace(
                     args.to_owned()
                         .iter()
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs
index ad618790a5..a4e9fd423b 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1508,7 +1508,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction {
             BuiltinScalarFunction::Lpad => Self::Lpad,
             BuiltinScalarFunction::Random => Self::Random,
             BuiltinScalarFunction::Uuid => Self::Uuid,
-            BuiltinScalarFunction::RegexpLike => Self::RegexpLike,
             BuiltinScalarFunction::RegexpReplace => Self::RegexpReplace,
             BuiltinScalarFunction::Repeat => Self::Repeat,
             BuiltinScalarFunction::Replace => Self::Replace,
diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt
index a80b08c41e..19966be209 100644
--- a/datafusion/sqllogictest/test_files/regexp.slt
+++ b/datafusion/sqllogictest/test_files/regexp.slt
@@ -124,6 +124,10 @@ SELECT regexp_like('(?<=[A-Z]\w )Smith', 'John Smith', 'i');
 ----
 false
 
+query B
+select regexp_like('aaa-555', '.*-(\d*)');
+----
+true
 
 #
 # regexp_match tests


Reply via email to