alamb commented on code in PR #9137:
URL: https://github.com/apache/arrow-datafusion/pull/9137#discussion_r1482891745
##########
datafusion-examples/README.md:
##########
@@ -52,21 +52,22 @@ cargo run --example csv_sql
- [`dataframe_output.rs`](examples/dataframe_output.rs): Examples of methods
which write data out from a DataFrame
- [`dataframe_in_memory.rs`](examples/dataframe_in_memory.rs): Run a query
using a DataFrame against data in memory
- [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert
query results into rust structs using serde
-- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and anaylze
`Expr`s
+- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and analyze
`Expr`s
- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run
DataFusion as a standalone process and execute SQL queries from JDBC clients
+- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date
function
Review Comment:
Thank you 🙏
##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -54,7 +54,120 @@ macro_rules! fetch_string_arg {
}};
}
-/// extract a specific group from a string column, using a regular expression
+/// Tests a string using a regular expression returning true if at
+/// least one match, false otherwise.
+///
+/// The full list of supported features and syntax can be found at
+/// <https://docs.rs/regex/latest/regex/#syntax>
+///
+/// Supported flags can be found at
+/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
+///
+/// # Examples
+///
+/// ```
+/// # use datafusion::prelude::*;
+/// # use datafusion::error::Result;
+/// # #[tokio::main]
+/// # async fn main() -> Result<()> {
+/// let ctx = SessionContext::new();
+/// let df = ctx.read_csv("tests/data/regex.csv",
CsvReadOptions::new()).await?;
+///
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' without flags
+/// let df = df.with_column(
+/// "a",
+/// regexp_like(vec![col("values"), col("patterns")])
+/// )?;
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' with flags
+/// let df = df.with_column(
+/// "b",
+/// regexp_like(vec![col("values"), col("patterns"), col("flags")])
+/// )?;
+/// // literals can be used as well with dataframe calls
+/// let df = df.with_column(
+/// "c",
+/// regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
+/// )?;
+///
+/// df.show().await?;
Review Comment:
One way to improve these examples would be to bake the expected results in
here.
For example, using pretty_print on the collected result and asserting they
are the same.
Or maybe just copy/pasting the output as a comment.
##########
datafusion/proto/proto/datafusion.proto:
##########
@@ -677,6 +677,7 @@ enum ScalarFunction {
InStr = 132;
MakeDate = 133;
ArrayReverse = 134;
+ RegexpLike = 135;
Review Comment:
Doesn't this need RegexpMatch as well? I may be missing something
##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -332,10 +482,70 @@ pub fn specialize_regexp_replace<T: OffsetSizeTrait>(
#[cfg(test)]
mod tests {
- use super::*;
use arrow::array::*;
+
use datafusion_common::ScalarValue;
+ use super::*;
+
+ #[test]
+ fn test_case_sensitive_regexp_like() {
+ let values = StringArray::from(vec!["abc"; 5]);
+
+ let patterns =
+ StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)",
"^(b|c)"]);
+
+ let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
+ expected_builder.append_value(true);
+ expected_builder.append_value(false);
+ expected_builder.append_value(true);
+ expected_builder.append_value(false);
+ expected_builder.append_value(false);
+ let expected = expected_builder.finish();
Review Comment:
This probably just follows the existing code in this file, but it is a bit
more concise to build `BooleanArray`s directly from a `Vec`.
Something like this (untested):
```suggestion
let expected = BooleanArray::from(vec![true, false, true, false,
false]);
```
##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -54,7 +54,120 @@ macro_rules! fetch_string_arg {
}};
}
-/// extract a specific group from a string column, using a regular expression
+/// Tests a string using a regular expression returning true if at
+/// least one match, false otherwise.
+///
+/// The full list of supported features and syntax can be found at
+/// <https://docs.rs/regex/latest/regex/#syntax>
+///
+/// Supported flags can be found at
+/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
+///
+/// # Examples
+///
+/// ```
+/// # use datafusion::prelude::*;
+/// # use datafusion::error::Result;
+/// # #[tokio::main]
+/// # async fn main() -> Result<()> {
+/// let ctx = SessionContext::new();
+/// let df = ctx.read_csv("tests/data/regex.csv",
CsvReadOptions::new()).await?;
+///
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' without flags
+/// let df = df.with_column(
+/// "a",
+/// regexp_like(vec![col("values"), col("patterns")])
+/// )?;
+/// // use the regexp_like function to test col 'values',
+/// // against patterns in col 'patterns' with flags
+/// let df = df.with_column(
+/// "b",
+/// regexp_like(vec![col("values"), col("patterns"), col("flags")])
+/// )?;
+/// // literals can be used as well with dataframe calls
+/// let df = df.with_column(
+/// "c",
+/// regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
+/// )?;
+///
+/// df.show().await?;
+///
+/// # Ok(())
+/// # }
+/// ```
+pub fn regexp_like<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+ match args.len() {
+ 2 => {
+ let values = as_generic_string_array::<T>(&args[0])?;
+ let regex = as_generic_string_array::<T>(&args[1])?;
+ let array = arrow_string::regexp::regexp_is_match_utf8(values,
regex, None)
+ .map_err(|e| arrow_datafusion_err!(e))?;
+
+ Ok(Arc::new(array) as ArrayRef)
+ }
+ 3 => {
+ let values = as_generic_string_array::<T>(&args[0])?;
+ let regex = as_generic_string_array::<T>(&args[1])?;
+ let flags = as_generic_string_array::<T>(&args[2])?;
+
+ if flags.iter().any(|s| s == Some("g")) {
+ return plan_err!("regexp_like() does not support the
\"global\" option");
+ }
+
+ let array = arrow_string::regexp::regexp_is_match_utf8(values,
regex, Some(flags))
Review Comment:
👍
##########
datafusion/sqllogictest/test_files/regexp.slt:
##########
@@ -0,0 +1,303 @@
+# Licensed to the Apache Software Foundation (ASF) under one
Review Comment:
Thank you -- this is much better
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]