Jefffrey commented on code in PR #20308:
URL: https://github.com/apache/datafusion/pull/20308#discussion_r2806741297


##########
datafusion/functions/src/regex/regexpextract.rs:
##########
@@ -0,0 +1,551 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Regex expressions
+use arrow::array::{Array, ArrayRef, Int32Array, StringArray, StringBuilder};
+use arrow::datatypes::DataType;
+use arrow::error::ArrowError;
+use datafusion_common::exec_err;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_macros::user_doc;
+use regex::Regex;
+use std::any::Any;
+use std::sync::Arc;
+
+// See https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html
+// See https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L863
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Extract the first string in the `str` that match the 
`regexp` expression and corresponding to the regex group index",
+    syntax_example = "regexp_extract(str, regexp[, idx])",
+    sql_example = r#"```sql
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1);
+            +---------------------------------------------------------+
+            | 100                                                   |
+            +---------------------------------------------------------+
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 2);
+            +---------------------------------------------------------+
+            | 200                                                   |
+            +---------------------------------------------------------+
+```
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
+"#,
+    argument(name = "str", description = "Column or column name"),
+    argument(
+        name = "regexp",
+        description = r#"a string representing a regular expression. The regex 
string should be a
+          Java regular expression.<br><br>
+          Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+          parser, see the unescaping rules at <a 
href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal";>String
 Literal</a>.
+          For example, to match "\abc", a regular expression for `regexp` can 
be "^\\abc$".<br><br>
+          There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+          fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+          if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".<br><br>
+          It's recommended to use a raw string literal (with the `r` prefix) 
to avoid escaping
+          special characters in the pattern string if exists."#
+    ),
+    argument(
+        name = "idx",
+        description = r#"an integer expression that representing the group 
index. The regex maybe contains
+          multiple groups. `idx` indicates which regex group to extract. The 
group index should
+          be non-negative. The minimum value of `idx` is 0, which means 
matching the entire
+          regular expression. If `idx` is not specified, the default group 
index value is 1.
+          This parameter is optional; when omitted the function defaults to 
extracting the first
+          capture group (idx=1), matching Spark's behavior."#
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct RegexpExtractFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpExtractFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpExtractFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    // Spark Catalyst Expression: RegExpExtract(subject, regexp, idx)
+                    // where idx defaults to 1 when omitted.
+                    // See: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+                    //
+                    // 2-arg form: regexp_extract(str, regexp) — idx defaults to 1
+                    // Matches Spark's: def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+                    TypeSignature::Exact(vec![Utf8, Utf8]),
+                    // 3-arg form: regexp_extract(str, regexp, idx)
+                    TypeSignature::Exact(vec![Utf8, Utf8, Int32]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpExtractFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;

Review Comment:
   We don't need all these checks in `return_type`; we can simply return `Ok(Utf8)`, as the signature should guard this for us
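
   A minimal sketch of that simplification (assuming the `Signature` keeps guarding the accepted argument types):

   ```rust
   fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
       // Spark's RegExpExtract always returns StringType, and the signature
       // already restricts the argument types, so no re-validation is needed.
       Ok(DataType::Utf8)
   }
   ```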



##########
datafusion/functions/src/regex/regexpextract.rs:
##########
@@ -0,0 +1,551 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Regex expressions
+use arrow::array::{Array, ArrayRef, Int32Array, StringArray, StringBuilder};
+use arrow::datatypes::DataType;
+use arrow::error::ArrowError;
+use datafusion_common::exec_err;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_macros::user_doc;
+use regex::Regex;
+use std::any::Any;
+use std::sync::Arc;
+
+// See https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html

Review Comment:
   Same here



##########
datafusion/functions/src/regex/regexpextract.rs:
##########
@@ -0,0 +1,551 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Regex expressions
+use arrow::array::{Array, ArrayRef, Int32Array, StringArray, StringBuilder};
+use arrow::datatypes::DataType;
+use arrow::error::ArrowError;
+use datafusion_common::exec_err;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_macros::user_doc;
+use regex::Regex;
+use std::any::Any;
+use std::sync::Arc;
+
+// See https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html
+// See https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L863
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Extract the first string in the `str` that match the 
`regexp` expression and corresponding to the regex group index",
+    syntax_example = "regexp_extract(str, regexp[, idx])",
+    sql_example = r#"```sql
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1);
+            +---------------------------------------------------------+
+            | 100                                                   |
+            +---------------------------------------------------------+
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 2);
+            +---------------------------------------------------------+
+            | 200                                                   |
+            +---------------------------------------------------------+
+```
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
+"#,
+    argument(name = "str", description = "Column or column name"),
+    argument(
+        name = "regexp",
+        description = r#"a string representing a regular expression. The regex 
string should be a
+          Java regular expression.<br><br>
+          Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+          parser, see the unescaping rules at <a 
href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal";>String
 Literal</a>.
+          For example, to match "\abc", a regular expression for `regexp` can 
be "^\\abc$".<br><br>
+          There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+          fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+          if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".<br><br>
+          It's recommended to use a raw string literal (with the `r` prefix) 
to avoid escaping
+          special characters in the pattern string if exists."#
+    ),
+    argument(
+        name = "idx",
+        description = r#"an integer expression that representing the group 
index. The regex maybe contains
+          multiple groups. `idx` indicates which regex group to extract. The 
group index should
+          be non-negative. The minimum value of `idx` is 0, which means 
matching the entire
+          regular expression. If `idx` is not specified, the default group 
index value is 1.
+          This parameter is optional; when omitted the function defaults to 
extracting the first
+          capture group (idx=1), matching Spark's behavior."#
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct RegexpExtractFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpExtractFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpExtractFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    // Spark Catalyst Expression: RegExpExtract(subject, regexp, idx)
+                    // where idx defaults to 1 when omitted.
+                    // See: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+                    //
+                    // 2-arg form: regexp_extract(str, regexp) — idx defaults to 1
+                    // Matches Spark's: def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+                    TypeSignature::Exact(vec![Utf8, Utf8]),
+                    // 3-arg form: regexp_extract(str, regexp, idx)
+                    TypeSignature::Exact(vec![Utf8, Utf8, Int32]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpExtractFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        // Spark's RegExpExtract always returns StringType
+        match arg_types.len() {
+            2 | 3 => match arg_types[0] {
+                Utf8 => Ok(Utf8),
+                _ => exec_err!("regexp_extract only supports Utf8 for arg0"),
+            },
+            _ => exec_err!(
+                "regexp_extract expects 2 or 3 arguments, got {}",
+                arg_types.len()
+            ),
+        }
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let args = &args.args;
+
+        if args.len() != 2 && args.len() != 3 {
+            return exec_err!("regexp_extract expects 2 or 3 arguments");
+        }
+
+        // DataFusion passes either scalars or arrays. Convert to arrays.
+        let len = args
+            .iter()
+            .map(|v| match v {
+                ColumnarValue::Array(a) => a.len(),
+                ColumnarValue::Scalar(_) => 1,
+            })
+            .max()
+            .unwrap_or(1);
+
+        let a0 = args[0].to_array(len)?;
+        let a1 = args[1].to_array(len)?;
+
+        // Spark Catalyst: def this(s, r) = this(s, r, Literal(1))
+        // When idx is omitted, default to group index 1.
+        let a2 = if args.len() == 3 {
+            args[2].to_array(len)?
+        } else {
+            // Default idx = 1, matching Spark's behavior
+            Arc::new(Int32Array::from(vec![1; len])) as ArrayRef
+        };
+
+        let out: ArrayRef = regexp_extract(&[a0, a1, a2])?;
+        Ok(ColumnarValue::Array(out))
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+/// Helper to build args for tests and external callers.
+pub fn regexp_extract(args: &[ArrayRef]) -> Result<ArrayRef> {

Review Comment:
   This doesn't need to be public
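
   For reference, a sketch with the visibility tightened (validation elided for brevity):

   ```rust
   /// Dispatches to the type-specific kernel; crate-internal only.
   fn regexp_extract(args: &[ArrayRef]) -> Result<ArrayRef> {
       match args[0].data_type() {
           DataType::Utf8 => regexp_extract_utf8(args),
           other => exec_err!("regexp_extract unsupported input type {other:?}"),
       }
   }
   ```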



##########
datafusion/functions/src/regex/mod.rs:
##########
@@ -65,6 +67,19 @@ pub mod expr_fn {
         super::regexp_match().call(args)
     }
 
+    /// Extracts a group that matches `regexp`. If `idx` is not specified,
+    /// it defaults to 1.
+    ///
+    /// Matches Spark's DataFrame API: `regexp_extract(e: Column, exp: String, groupIdx: Int)`

Review Comment:
   We probably should remove mention of Spark since we're adding this as a 
DataFusion function (i.e. not to the `datafusion-spark` crate)
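
   One possible Spark-free wording (a sketch; only the doc comment changes):

   ```rust
   /// Extracts the capture group at `idx` from the first match of `regexp`
   /// in the string. `idx` defaults to 1 when omitted; an `idx` of 0
   /// returns the entire match.
   ```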



##########
datafusion/functions/src/regex/regexpextract.rs:
##########
@@ -0,0 +1,551 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Regex expressions
+use arrow::array::{Array, ArrayRef, Int32Array, StringArray, StringBuilder};
+use arrow::datatypes::DataType;
+use arrow::error::ArrowError;
+use datafusion_common::exec_err;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_macros::user_doc;
+use regex::Regex;
+use std::any::Any;
+use std::sync::Arc;
+
+// See https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html
+// See https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L863
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Extract the first string in the `str` that match the 
`regexp` expression and corresponding to the regex group index",
+    syntax_example = "regexp_extract(str, regexp[, idx])",
+    sql_example = r#"```sql
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1);
+            +---------------------------------------------------------+
+            | 100                                                   |
+            +---------------------------------------------------------+
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 2);
+            +---------------------------------------------------------+
+            | 200                                                   |
+            +---------------------------------------------------------+
+```
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
+"#,
+    argument(name = "str", description = "Column or column name"),
+    argument(
+        name = "regexp",
+        description = r#"a string representing a regular expression. The regex 
string should be a
+          Java regular expression.<br><br>
+          Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+          parser, see the unescaping rules at <a 
href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal";>String
 Literal</a>.
+          For example, to match "\abc", a regular expression for `regexp` can 
be "^\\abc$".<br><br>
+          There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+          fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+          if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".<br><br>
+          It's recommended to use a raw string literal (with the `r` prefix) 
to avoid escaping
+          special characters in the pattern string if exists."#
+    ),
+    argument(
+        name = "idx",
+        description = r#"an integer expression that representing the group 
index. The regex maybe contains
+          multiple groups. `idx` indicates which regex group to extract. The 
group index should
+          be non-negative. The minimum value of `idx` is 0, which means 
matching the entire
+          regular expression. If `idx` is not specified, the default group 
index value is 1.
+          This parameter is optional; when omitted the function defaults to 
extracting the first
+          capture group (idx=1), matching Spark's behavior."#
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct RegexpExtractFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpExtractFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpExtractFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    // Spark Catalyst Expression: RegExpExtract(subject, regexp, idx)
+                    // where idx defaults to 1 when omitted.
+                    // See: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+                    //
+                    // 2-arg form: regexp_extract(str, regexp) — idx defaults to 1
+                    // Matches Spark's: def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+                    TypeSignature::Exact(vec![Utf8, Utf8]),
+                    // 3-arg form: regexp_extract(str, regexp, idx)
+                    TypeSignature::Exact(vec![Utf8, Utf8, Int32]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpExtractFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        // Spark's RegExpExtract always returns StringType
+        match arg_types.len() {
+            2 | 3 => match arg_types[0] {
+                Utf8 => Ok(Utf8),
+                _ => exec_err!("regexp_extract only supports Utf8 for arg0"),
+            },
+            _ => exec_err!(
+                "regexp_extract expects 2 or 3 arguments, got {}",
+                arg_types.len()
+            ),
+        }
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let args = &args.args;
+
+        if args.len() != 2 && args.len() != 3 {
+            return exec_err!("regexp_extract expects 2 or 3 arguments");
+        }
+
+        // DataFusion passes either scalars or arrays. Convert to arrays.
+        let len = args
+            .iter()
+            .map(|v| match v {
+                ColumnarValue::Array(a) => a.len(),
+                ColumnarValue::Scalar(_) => 1,
+            })
+            .max()
+            .unwrap_or(1);
+
+        let a0 = args[0].to_array(len)?;
+        let a1 = args[1].to_array(len)?;
+
+        // Spark Catalyst: def this(s, r) = this(s, r, Literal(1))
+        // When idx is omitted, default to group index 1.
+        let a2 = if args.len() == 3 {
+            args[2].to_array(len)?
+        } else {
+            // Default idx = 1, matching Spark's behavior
+            Arc::new(Int32Array::from(vec![1; len])) as ArrayRef
+        };
+
+        let out: ArrayRef = regexp_extract(&[a0, a1, a2])?;
+        Ok(ColumnarValue::Array(out))
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+/// Helper to build args for tests and external callers.
+pub fn regexp_extract(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() != 3 {
+        return exec_err!("regexp_extract expects 3 arguments");
+    }
+
+    match args[0].data_type() {
+        // TODO: DataType::Utf8View => regexp_extract_utf8_view(args),
+        DataType::Utf8 => regexp_extract_utf8(args),
+        // TODO: DataType::LargeUtf8 => regexp_extract_large_utf8(args),
+        other => exec_err!("regexp_extract unsupported input type {other:?}"),
+    }
+}
+
+fn regexp_extract_utf8(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let values = &args[0];
+    let pattern = &args[1];
+    let index = &args[2];
+
+    let values_array = values
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .ok_or_else(|| DataFusionError::Execution("arg0 must be Utf8".to_string()))?;
+    let pattern_array = pattern
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .ok_or_else(|| DataFusionError::Execution("arg1 must be Utf8".to_string()))?;
+    let index_array = index
+        .as_any()
+        .downcast_ref::<Int32Array>()
+        .ok_or_else(|| DataFusionError::Execution("arg2 must be Int32".to_string()))?;
+
+    let mut out = StringBuilder::new();
+
+    // Now iterate over the values, pattern, and index arrays and extract the matches
+
+    for i in 0..values_array.len() {
+        if values_array.is_null(i) || pattern_array.is_null(i) || index_array.is_null(i) {
+            out.append_null();
+            continue;
+        }
+
+        let value = values_array.value(i);
+        let pattern = pattern_array.value(i);
+        let index = index_array.value(i);
+
+        if index < 0 {
+            return exec_err!(
+                "The value of idx in regexp_extract must be non-negative, but 
got {}",
+                index
+            );
+        }
+
+        let group_index = index as usize;
+
+        let regex =
+            Regex::new(pattern).map_err(|e| ArrowError::ComputeError(e.to_string()))?;
+
+        // Validate group index doesn't exceed the number of capture groups
+        let num_groups = regex.captures_len();
+        if group_index >= num_groups {
+            return exec_err!(
+                "Regex group index {} exceeds the number of groups {} in 
pattern '{}'",
+                group_index,
+                num_groups - 1,
+                pattern
+            );
+        }
+
+        if let Some(cap) = regex.captures(value) {
+            // cap.get() returns None for unmatched optional groups like (b)?
+            // Spark returns empty string in that case
+            match cap.get(group_index) {
+                Some(m) => out.append_value(m.as_str()),
+                None => out.append_value(""),
+            }
+        } else {
+            // No match at all — Spark returns empty string, not null
+            out.append_value("");
+        }
+    }
+
+    Ok(Arc::new(out.finish()))
+}
+
+#[cfg(test)]
+mod tests {

Review Comment:
   Could we move all these tests to be SLTs instead?
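
   For example, the core cases could become sqllogictest entries (a sketch; the target file and exact escaping are assumptions based on the existing regexp test files):

   ```
   query T
   SELECT regexp_extract('100-200', '(\d+)-(\d+)', 2);
   ----
   200

   # omitted idx defaults to group 1
   query T
   SELECT regexp_extract('100-200', '(\d+)-(\d+)');
   ----
   100

   # no match yields an empty string, not NULL
   query T
   SELECT regexp_extract('foo', '(\d+)', 1);
   ----
   (empty)
   ```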



##########
datafusion/functions/src/regex/regexpextract.rs:
##########
@@ -0,0 +1,551 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Regex expressions
+use arrow::array::{Array, ArrayRef, Int32Array, StringArray, StringBuilder};
+use arrow::datatypes::DataType;
+use arrow::error::ArrowError;
+use datafusion_common::exec_err;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_macros::user_doc;
+use regex::Regex;
+use std::any::Any;
+use std::sync::Arc;
+
+// See https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html
+// See https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L863
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Extract the first string in the `str` that match the 
`regexp` expression and corresponding to the regex group index",
+    syntax_example = "regexp_extract(str, regexp[, idx])",
+    sql_example = r#"```sql
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1);
+            +---------------------------------------------------------+
+            | 100                                                   |
+            +---------------------------------------------------------+
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 2);
+            +---------------------------------------------------------+
+            | 200                                                   |
+            +---------------------------------------------------------+
+```
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
+"#,
+    argument(name = "str", description = "Column or column name"),
+    argument(
+        name = "regexp",
+        description = r#"a string representing a regular expression. The regex 
string should be a
+          Java regular expression.<br><br>
+          Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+          parser, see the unescaping rules at <a 
href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal";>String
 Literal</a>.
+          For example, to match "\abc", a regular expression for `regexp` can 
be "^\\abc$".<br><br>
+          There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+          fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+          if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".<br><br>
+          It's recommended to use a raw string literal (with the `r` prefix) 
to avoid escaping
+          special characters in the pattern string if exists."#
+    ),
+    argument(
+        name = "idx",
+        description = r#"an integer expression that representing the group 
index. The regex maybe contains
+          multiple groups. `idx` indicates which regex group to extract. The 
group index should
+          be non-negative. The minimum value of `idx` is 0, which means 
matching the entire
+          regular expression. If `idx` is not specified, the default group 
index value is 1.
+          This parameter is optional; when omitted the function defaults to 
extracting the first
+          capture group (idx=1), matching Spark's behavior."#
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct RegexpExtractFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpExtractFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpExtractFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    // Spark Catalyst Expression: RegExpExtract(subject, regexp, idx)
+                    // where idx defaults to 1 when omitted.
+                    // See: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+                    //
+                    // 2-arg form: regexp_extract(str, regexp) — idx defaults to 1
+                    // Matches Spark's: def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+                    TypeSignature::Exact(vec![Utf8, Utf8]),
+                    // 3-arg form: regexp_extract(str, regexp, idx)
+                    TypeSignature::Exact(vec![Utf8, Utf8, Int32]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpExtractFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        // Spark's RegExpExtract always returns StringType
+        match arg_types.len() {
+            2 | 3 => match arg_types[0] {
+                Utf8 => Ok(Utf8),
+                _ => exec_err!("regexp_extract only supports Utf8 for arg0"),
+            },
+            _ => exec_err!(
+                "regexp_extract expects 2 or 3 arguments, got {}",
+                arg_types.len()
+            ),
+        }
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let args = &args.args;
+
+        if args.len() != 2 && args.len() != 3 {
+            return exec_err!("regexp_extract expects 2 or 3 arguments");
+        }
+
+        // DataFusion passes either scalars or arrays. Convert to arrays.
+        let len = args

Review Comment:
   We should just use [`make_scalar_function`](https://docs.rs/datafusion/latest/datafusion/functions/utils/fn.make_scalar_function.html), which handles this boilerplate for us if we don't want to deal with `ColumnarValue`s
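
   A sketch of what that could look like (assuming the idx-defaulting for the 2-argument form moves into the kernel):

   ```rust
   use crate::utils::make_scalar_function;

   fn invoke_with_args(
       &self,
       args: datafusion_expr::ScalarFunctionArgs,
   ) -> Result<ColumnarValue> {
       // make_scalar_function expands scalar inputs to arrays of the batch
       // length and re-wraps the kernel's output as a ColumnarValue.
       make_scalar_function(regexp_extract, vec![])(&args.args)
   }
   ```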



##########
datafusion/functions/src/regex/regexpextract.rs:
##########
@@ -0,0 +1,551 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Regex expressions
+use arrow::array::{Array, ArrayRef, Int32Array, StringArray, StringBuilder};
+use arrow::datatypes::DataType;
+use arrow::error::ArrowError;
+use datafusion_common::exec_err;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_macros::user_doc;
+use regex::Regex;
+use std::any::Any;
+use std::sync::Arc;
+
+// See https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html
+// See https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L863
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Extract the first string in the `str` that match the 
`regexp` expression and corresponding to the regex group index",
+    syntax_example = "regexp_extract(str, regexp[, idx])",
+    sql_example = r#"```sql
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1);
+            +---------------------------------------------------------+
+            | 100                                                   |
+            +---------------------------------------------------------+
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 2);
+            +---------------------------------------------------------+
+            | 200                                                   |
+            +---------------------------------------------------------+
+```
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
+"#,
+    argument(name = "str", description = "Column or column name"),
+    argument(
+        name = "regexp",
+        description = r#"a string representing a regular expression. The regex 
string should be a
+          Java regular expression.<br><br>
+          Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+          parser, see the unescaping rules at <a 
href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal";>String
 Literal</a>.
+          For example, to match "\abc", a regular expression for `regexp` can 
be "^\\abc$".<br><br>
+          There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+          fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+          if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".<br><br>
+          It's recommended to use a raw string literal (with the `r` prefix) 
to avoid escaping
+          special characters in the pattern string if exists."#
+    ),
+    argument(
+        name = "idx",
+        description = r#"an integer expression that representing the group 
index. The regex maybe contains
+          multiple groups. `idx` indicates which regex group to extract. The 
group index should
+          be non-negative. The minimum value of `idx` is 0, which means 
matching the entire
+          regular expression. If `idx` is not specified, the default group 
index value is 1.
+          This parameter is optional; when omitted the function defaults to 
extracting the first
+          capture group (idx=1), matching Spark's behavior."#
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct RegexpExtractFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpExtractFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpExtractFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    // Spark Catalyst Expression: RegExpExtract(subject, regexp, idx)
+                    // where idx defaults to 1 when omitted.
+                    // See: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+                    //
+                    // 2-arg form: regexp_extract(str, regexp) — idx defaults to 1
+                    // Matches Spark's: def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+                    TypeSignature::Exact(vec![Utf8, Utf8]),
+                    // 3-arg form: regexp_extract(str, regexp, idx)
+                    TypeSignature::Exact(vec![Utf8, Utf8, Int32]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpExtractFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        // Spark's RegExpExtract always returns StringType
+        match arg_types.len() {
+            2 | 3 => match arg_types[0] {
+                Utf8 => Ok(Utf8),
+                _ => exec_err!("regexp_extract only supports Utf8 for arg0"),
+            },
+            _ => exec_err!(
+                "regexp_extract expects 2 or 3 arguments, got {}",
+                arg_types.len()
+            ),
+        }
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let args = &args.args;
+
+        if args.len() != 2 && args.len() != 3 {
+            return exec_err!("regexp_extract expects 2 or 3 arguments");
+        }
+
+        // DataFusion passes either scalars or arrays. Convert to arrays.
+        let len = args
+            .iter()
+            .map(|v| match v {
+                ColumnarValue::Array(a) => a.len(),
+                ColumnarValue::Scalar(_) => 1,
+            })
+            .max()
+            .unwrap_or(1);
+
+        let a0 = args[0].to_array(len)?;
+        let a1 = args[1].to_array(len)?;
+
+        // Spark Catalyst: def this(s, r) = this(s, r, Literal(1))
+        // When idx is omitted, default to group index 1.
+        let a2 = if args.len() == 3 {
+            args[2].to_array(len)?
+        } else {
+            // Default idx = 1, matching Spark's behavior
+            Arc::new(Int32Array::from(vec![1; len])) as ArrayRef
+        };
+
+        let out: ArrayRef = regexp_extract(&[a0, a1, a2])?;
+        Ok(ColumnarValue::Array(out))
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+/// Helper to build args for tests and external callers.
+pub fn regexp_extract(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() != 3 {

Review Comment:
   If it needs 3 arguments we should make the signature 3 distinct arguments 
instead of a slice
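
   i.e. something like (a sketch; `regexp_extract_utf8` would take the same three parameters):

   ```rust
   fn regexp_extract(
       values: &ArrayRef,
       pattern: &ArrayRef,
       idx: &ArrayRef,
   ) -> Result<ArrayRef> {
       match values.data_type() {
           DataType::Utf8 => regexp_extract_utf8(values, pattern, idx),
           other => exec_err!("regexp_extract unsupported input type {other:?}"),
       }
   }
   ```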



##########
datafusion/functions/src/regex/regexpextract.rs:
##########
@@ -0,0 +1,551 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Regex expressions
+use arrow::array::{Array, ArrayRef, Int32Array, StringArray, StringBuilder};
+use arrow::datatypes::DataType;
+use arrow::error::ArrowError;
+use datafusion_common::exec_err;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_macros::user_doc;
+use regex::Regex;
+use std::any::Any;
+use std::sync::Arc;
+
+// See https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html
+// See https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L863
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Extract the first string in the `str` that match the 
`regexp` expression and corresponding to the regex group index",
+    syntax_example = "regexp_extract(str, regexp[, idx])",
+    sql_example = r#"```sql
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1);
+            +---------------------------------------------------------+
+            | 100                                                   |
+            +---------------------------------------------------------+
+            > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 2);
+            +---------------------------------------------------------+
+            | 200                                                   |
+            +---------------------------------------------------------+
+```
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
+"#,
+    argument(name = "str", description = "Column or column name"),
+    argument(
+        name = "regexp",
+        description = r#"a string representing a regular expression. The regex 
string should be a
+          Java regular expression.<br><br>
+          Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+          parser, see the unescaping rules at <a 
href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal";>String
 Literal</a>.
+          For example, to match "\abc", a regular expression for `regexp` can 
be "^\\abc$".<br><br>
+          There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+          fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+          if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".<br><br>
+          It's recommended to use a raw string literal (with the `r` prefix) 
to avoid escaping
+          special characters in the pattern string if exists."#
+    ),
+    argument(
+        name = "idx",
+        description = r#"an integer expression that representing the group 
index. The regex maybe contains
+          multiple groups. `idx` indicates which regex group to extract. The 
group index should
+          be non-negative. The minimum value of `idx` is 0, which means 
matching the entire
+          regular expression. If `idx` is not specified, the default group 
index value is 1.
+          This parameter is optional; when omitted the function defaults to 
extracting the first
+          capture group (idx=1), matching Spark's behavior."#
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct RegexpExtractFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpExtractFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpExtractFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    // Spark Catalyst Expression: RegExpExtract(subject, regexp, idx)
+                    // where idx defaults to 1 when omitted.
+                    // See: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+                    //
+                    // 2-arg form: regexp_extract(str, regexp) — idx defaults to 1
+                    // Matches Spark's: def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+                    TypeSignature::Exact(vec![Utf8, Utf8]),
+                    // 3-arg form: regexp_extract(str, regexp, idx)
+                    TypeSignature::Exact(vec![Utf8, Utf8, Int32]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpExtractFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        // Spark's RegExpExtract always returns StringType
+        match arg_types.len() {
+            2 | 3 => match arg_types[0] {
+                Utf8 => Ok(Utf8),
+                _ => exec_err!("regexp_extract only supports Utf8 for arg0"),
+            },
+            _ => exec_err!(
+                "regexp_extract expects 2 or 3 arguments, got {}",
+                arg_types.len()
+            ),
+        }
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let args = &args.args;
+
+        if args.len() != 2 && args.len() != 3 {
+            return exec_err!("regexp_extract expects 2 or 3 arguments");
+        }
+
+        // DataFusion passes either scalars or arrays. Convert to arrays.
+        let len = args
+            .iter()
+            .map(|v| match v {
+                ColumnarValue::Array(a) => a.len(),
+                ColumnarValue::Scalar(_) => 1,
+            })
+            .max()
+            .unwrap_or(1);
+
+        let a0 = args[0].to_array(len)?;
+        let a1 = args[1].to_array(len)?;
+
+        // Spark Catalyst: def this(s, r) = this(s, r, Literal(1))
+        // When idx is omitted, default to group index 1.
+        let a2 = if args.len() == 3 {
+            args[2].to_array(len)?
+        } else {
+            // Default idx = 1, matching Spark's behavior
+            Arc::new(Int32Array::from(vec![1; len])) as ArrayRef
+        };
+
+        let out: ArrayRef = regexp_extract(&[a0, a1, a2])?;
+        Ok(ColumnarValue::Array(out))
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+/// Helper to build args for tests and external callers.
+pub fn regexp_extract(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() != 3 {
+        return exec_err!("regexp_extract expects 3 arguments");
+    }
+
+    match args[0].data_type() {
+        // TODO: DataType::Utf8View => regexp_extract_utf8_view(args),
+        DataType::Utf8 => regexp_extract_utf8(args),
+        // TODO: DataType::LargeUtf8 => regexp_extract_large_utf8(args),
+        other => exec_err!("regexp_extract unsupported input type {other:?}"),
+    }
+}
+
+fn regexp_extract_utf8(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let values = &args[0];
+    let pattern = &args[1];
+    let index = &args[2];
+
+    let values_array = values

Review Comment:
   Can use [`as_string_array`](https://docs.rs/datafusion/latest/datafusion/common/cast/fn.as_string_array.html) for easier downcasting here; same idea for the int array below too
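
   e.g. a sketch using the `datafusion_common::cast` helpers:

   ```rust
   use datafusion_common::cast::{as_int32_array, as_string_array};

   // Each helper returns a descriptive error on type mismatch, replacing
   // the manual downcast_ref/ok_or_else chains.
   let values_array = as_string_array(values)?;
   let pattern_array = as_string_array(pattern)?;
   let index_array = as_int32_array(index)?;
   ```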



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

