martin-g commented on code in PR #20354:
URL: https://github.com/apache/datafusion/pull/20354#discussion_r2817407690
##########
datafusion/functions/src/regex/regexplike.rs:
##########
@@ -314,6 +336,121 @@ pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef>
{
}
}
+fn scalar_string(value: &ScalarValue) -> Result<Option<&str>> {
+ match value {
+ ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) |
ScalarValue::Utf8View(v) => {
+ Ok(v.as_deref())
+ }
+ ScalarValue::Null => Ok(None),
+ _ => internal_err!(
+ "Unsupported data type {:?} for function `regexp_like`",
+ value.data_type()
+ ),
+ }
+}
+
+fn regexp_like_array_scalar(
+ values: &ArrayRef,
+ pattern: Option<&str>,
+ flags: Option<&str>,
+) -> Result<ArrayRef> {
+ use DataType::*;
+
+ if pattern.is_none() {
+ return Ok(Arc::new(BooleanArray::new_null(values.len())));
+ }
+
+ let pattern = pattern.unwrap();
+ let array = match values.data_type() {
+ Utf8 => {
+ let array = values.as_string::<i32>();
+ regexp::regexp_is_match_scalar(array, pattern, flags)?
+ }
+ Utf8View => {
+ let array = values.as_string_view();
+ regexp::regexp_is_match_scalar(array, pattern, flags)?
+ }
+ LargeUtf8 => {
+ let array = values.as_string::<i64>();
+ regexp::regexp_is_match_scalar(array, pattern, flags)?
+ }
+ other => {
+ return internal_err!(
+ "Unsupported data type {other:?} for function `regexp_like`"
+ );
+ }
+ };
+
+ Ok(Arc::new(array))
+}
+
+fn regexp_like_scalar(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ let flags = if args.len() == 3 {
+ match &args[2] {
+ ColumnarValue::Scalar(v) => scalar_string(v)?,
+ _ => {
+ return internal_err!(
+ "Unexpected non-scalar argument for function `regexp_like`"
+ );
+ }
+ }
+ } else {
+ None
+ };
+
+ if flags == Some("g") {
+ return plan_err!("regexp_like() does not support the \"global\"
option");
+ }
+
+ let value = match &args[0] {
+ ColumnarValue::Scalar(v) => v,
+ _ => {
+ return internal_err!(
+ "Unexpected non-scalar argument for function `regexp_like`"
+ );
+ }
+ };
+ let pattern = match &args[1] {
+ ColumnarValue::Scalar(v) => v,
+ _ => {
+ return internal_err!(
+ "Unexpected non-scalar argument for function `regexp_like`"
+ );
+ }
+ };
+
+ let value = scalar_string(value)?;
+ let pattern = scalar_string(pattern)?;
+ if value.is_none() || pattern.is_none() {
+ return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+ }
+
+ let value = value.unwrap();
+ let pattern = pattern.unwrap();
+ let result = match &args[0] {
+ ColumnarValue::Scalar(ScalarValue::Utf8(_)) => {
+ let array = StringArray::from(vec![value]);
Review Comment:
Isn't the idea of the optimisation to avoid constructing arrays for scalar
values?
IMO this should use `regex::Regex` directly.
##########
datafusion/functions/src/regex/regexplike.rs:
##########
@@ -130,29 +133,48 @@ impl ScalarUDFImpl for RegexpLikeFunc {
args: datafusion_expr::ScalarFunctionArgs,
) -> Result<ColumnarValue> {
let args = &args.args;
+ match args.len() {
+ 2 | 3 => {}
+ other => {
+ return exec_err!(
+ "`regexp_like` was called with {other} arguments. It
requires at least 2 and at most 3."
+ );
+ }
+ }
- let len = args
- .iter()
- .fold(Option::<usize>::None, |acc, arg| match arg {
- ColumnarValue::Scalar(_) => acc,
- ColumnarValue::Array(a) => Some(a.len()),
- });
-
- let is_scalar = len.is_none();
- let inferred_length = len.unwrap_or(1);
- let args = args
+ if args
.iter()
- .map(|arg| arg.to_array(inferred_length))
- .collect::<Result<Vec<_>>>()?;
-
- let result = regexp_like(&args);
- if is_scalar {
- // If all inputs are scalar, keeps output as scalar
- let result = result.and_then(|arr|
ScalarValue::try_from_array(&arr, 0));
- result.map(ColumnarValue::Scalar)
- } else {
- result.map(ColumnarValue::Array)
+ .all(|arg| matches!(arg, ColumnarValue::Scalar(_)))
+ {
+ return regexp_like_scalar(args);
}
+
+ match args.as_slice() {
+ [ColumnarValue::Array(values), ColumnarValue::Scalar(pattern)] => {
+ let pattern = scalar_string(pattern)?;
+ let array = regexp_like_array_scalar(values, pattern, None)?;
+ return Ok(ColumnarValue::Array(array));
+ }
+ [
+ ColumnarValue::Array(values),
+ ColumnarValue::Scalar(pattern),
+ ColumnarValue::Scalar(flags),
+ ] => {
+ let flags = scalar_string(flags)?;
+ if flags == Some("g") {
+ return plan_err!(
+ "regexp_like() does not support the \"global\" option"
+ );
+ }
Review Comment:
```suggestion
if let Some(flagz) = flags && flagz.contains("g") {
return plan_err!(
"regexp_like() does not support the \"global\"
option"
);
}
```
##########
datafusion/functions/src/regex/regexplike.rs:
##########
@@ -314,6 +336,121 @@ pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef>
{
}
}
+fn scalar_string(value: &ScalarValue) -> Result<Option<&str>> {
+ match value {
+ ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) |
ScalarValue::Utf8View(v) => {
+ Ok(v.as_deref())
+ }
+ ScalarValue::Null => Ok(None),
+ _ => internal_err!(
+ "Unsupported data type {:?} for function `regexp_like`",
+ value.data_type()
+ ),
+ }
+}
+
+fn regexp_like_array_scalar(
+ values: &ArrayRef,
+ pattern: Option<&str>,
+ flags: Option<&str>,
+) -> Result<ArrayRef> {
+ use DataType::*;
+
+ if pattern.is_none() {
+ return Ok(Arc::new(BooleanArray::new_null(values.len())));
+ }
+
+ let pattern = pattern.unwrap();
+ let array = match values.data_type() {
+ Utf8 => {
+ let array = values.as_string::<i32>();
+ regexp::regexp_is_match_scalar(array, pattern, flags)?
+ }
+ Utf8View => {
+ let array = values.as_string_view();
+ regexp::regexp_is_match_scalar(array, pattern, flags)?
+ }
+ LargeUtf8 => {
+ let array = values.as_string::<i64>();
+ regexp::regexp_is_match_scalar(array, pattern, flags)?
+ }
+ other => {
+ return internal_err!(
+ "Unsupported data type {other:?} for function `regexp_like`"
+ );
+ }
+ };
+
+ Ok(Arc::new(array))
+}
+
+fn regexp_like_scalar(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ let flags = if args.len() == 3 {
+ match &args[2] {
+ ColumnarValue::Scalar(v) => scalar_string(v)?,
+ _ => {
+ return internal_err!(
+ "Unexpected non-scalar argument for function `regexp_like`"
+ );
+ }
+ }
+ } else {
+ None
+ };
+
+ if flags == Some("g") {
+ return plan_err!("regexp_like() does not support the \"global\"
option");
+ }
Review Comment:
```suggestion
if let Some(flagz) = flags && flagz.contains("g") {
return plan_err!("regexp_like() does not support the \"global\"
option");
}
```
The third argument could be "ig", i.e. several flags rather than just one, so an exact comparison with "g" would miss it.
##########
datafusion/functions/src/regex/regexplike.rs:
##########
@@ -130,29 +133,48 @@ impl ScalarUDFImpl for RegexpLikeFunc {
args: datafusion_expr::ScalarFunctionArgs,
) -> Result<ColumnarValue> {
let args = &args.args;
+ match args.len() {
Review Comment:
Is this check needed?
Isn't the argument count already enforced by the function's signature?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]