alamb commented on code in PR #9137:
URL: https://github.com/apache/arrow-datafusion/pull/9137#discussion_r1482891745


##########
datafusion-examples/README.md:
##########
@@ -52,21 +52,22 @@ cargo run --example csv_sql
 - [`dataframe_output.rs`](examples/dataframe_output.rs): Examples of methods 
which write data out from a DataFrame
 - [`dataframe_in_memory.rs`](examples/dataframe_in_memory.rs): Run a query 
using a DataFrame against data in memory
 - [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert 
query results into rust structs using serde
-- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and anaylze 
`Expr`s
+- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and analyze 
`Expr`s
 - [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run 
DataFusion as a standalone process and execute SQL queries from JDBC clients
+- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date 
function

Review Comment:
   Thank you 🙏 



##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -54,7 +54,120 @@ macro_rules! fetch_string_arg {
     }};
 }
 
-/// extract a specific group from a string column, using a regular expression
+/// Tests a string using a regular expression returning true if at
+/// least one match, false otherwise.
+///
+/// The full list of supported features and syntax can be found at
+/// <https://docs.rs/regex/latest/regex/#syntax>
+///
+/// Supported flags can be found at
+/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
+///
+/// # Examples
+///
+/// ```
+/// # use datafusion::prelude::*;
+/// # use datafusion::error::Result;
+/// # #[tokio::main]
+/// # async fn main() -> Result<()> {
+/// let ctx = SessionContext::new();
+/// let df = ctx.read_csv("tests/data/regex.csv", 
CsvReadOptions::new()).await?;
+///
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' without flags
+/// let df = df.with_column(
+///     "a",
+///     regexp_like(vec![col("values"), col("patterns")])
+/// )?;
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' with flags
+/// let df = df.with_column(
+///     "b",
+///     regexp_like(vec![col("values"), col("patterns"), col("flags")])
+/// )?;
+/// // literals can be used as well with dataframe calls
+/// let df = df.with_column(
+///     "c",
+///     regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
+/// )?;
+///
+/// df.show().await?;

Review Comment:
   One way to improve these examples would be to bake the expected results 
into them.
   
   For example, using pretty_print on the collected result and asserting they 
are the same. 
   
   Or maybe just copy/pasting the output as a comment



##########
datafusion/proto/proto/datafusion.proto:
##########
@@ -677,6 +677,7 @@ enum ScalarFunction {
   InStr = 132;
   MakeDate = 133;
   ArrayReverse = 134;
+  RegexpLike = 135;

Review Comment:
   Doesn't this need RegexpMatch as well? I may be missing something



##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -332,10 +482,70 @@ pub fn specialize_regexp_replace<T: OffsetSizeTrait>(
 
 #[cfg(test)]
 mod tests {
-    use super::*;
     use arrow::array::*;
+
     use datafusion_common::ScalarValue;
 
+    use super::*;
+
+    #[test]
+    fn test_case_sensitive_regexp_like() {
+        let values = StringArray::from(vec!["abc"; 5]);
+
+        let patterns =
+            StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", 
"^(b|c)"]);
+
+        let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
+        expected_builder.append_value(true);
+        expected_builder.append_value(false);
+        expected_builder.append_value(true);
+        expected_builder.append_value(false);
+        expected_builder.append_value(false);
+        let expected = expected_builder.finish();

Review Comment:
   This probably just follows the existing code in this file, but it is a bit 
more concise to build `BooleanArray`s like:
   
   Something like this (untested):
   ```suggestion
           let expected = BooleanArray::from(vec![true, false, true, false, 
false]);
   ```



##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -54,7 +54,120 @@ macro_rules! fetch_string_arg {
     }};
 }
 
-/// extract a specific group from a string column, using a regular expression
+/// Tests a string using a regular expression returning true if at
+/// least one match, false otherwise.
+///
+/// The full list of supported features and syntax can be found at
+/// <https://docs.rs/regex/latest/regex/#syntax>
+///
+/// Supported flags can be found at
+/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
+///
+/// # Examples
+///
+/// ```
+/// # use datafusion::prelude::*;
+/// # use datafusion::error::Result;
+/// # #[tokio::main]
+/// # async fn main() -> Result<()> {
+/// let ctx = SessionContext::new();
+/// let df = ctx.read_csv("tests/data/regex.csv", 
CsvReadOptions::new()).await?;
+///
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' without flags
+/// let df = df.with_column(
+///     "a",
+///     regexp_like(vec![col("values"), col("patterns")])
+/// )?;
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' with flags
+/// let df = df.with_column(
+///     "b",
+///     regexp_like(vec![col("values"), col("patterns"), col("flags")])
+/// )?;
+/// // literals can be used as well with dataframe calls
+/// let df = df.with_column(
+///     "c",
+///     regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
+/// )?;
+///
+/// df.show().await?;
+///
+/// # Ok(())
+/// # }
+/// ```
+pub fn regexp_like<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+    match args.len() {
+        2 => {
+            let values = as_generic_string_array::<T>(&args[0])?;
+            let regex = as_generic_string_array::<T>(&args[1])?;
+            let array = arrow_string::regexp::regexp_is_match_utf8(values, 
regex, None)
+                .map_err(|e| arrow_datafusion_err!(e))?;
+
+            Ok(Arc::new(array) as ArrayRef)
+        }
+        3 => {
+            let values = as_generic_string_array::<T>(&args[0])?;
+            let regex = as_generic_string_array::<T>(&args[1])?;
+            let flags = as_generic_string_array::<T>(&args[2])?;
+
+            if flags.iter().any(|s| s == Some("g")) {
+                return plan_err!("regexp_like() does not support the 
\"global\" option");
+            }
+
+            let array = arrow_string::regexp::regexp_is_match_utf8(values, 
regex, Some(flags))

Review Comment:
   👍 



##########
datafusion/sqllogictest/test_files/regexp.slt:
##########
@@ -0,0 +1,303 @@
+# Licensed to the Apache Software Foundation (ASF) under one

Review Comment:
   Thank you -- this is much better



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to