coderfender commented on code in PR #2831:
URL: https://github.com/apache/datafusion-comet/pull/2831#discussion_r2575900748


##########
native/spark-expr/src/string_funcs/regexp_extract.rs:
##########
@@ -0,0 +1,401 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, GenericStringArray};
+use arrow::datatypes::DataType;
+use datafusion::common::{exec_err, internal_datafusion_err, Result, ScalarValue};
+use datafusion::logical_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use regex::Regex;
+use std::sync::Arc;
+use std::any::Any;
+
+/// Spark-compatible regexp_extract function
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkRegExpExtract {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkRegExpExtract {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkRegExpExtract {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkRegExpExtract {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        // regexp_extract always returns String
+        Ok(DataType::Utf8)
+    }

Review Comment:
   @danielhumanmod, could we verify that we don't also need to return `LargeUtf8` here?
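   
   A rough sketch of what I have in mind, assuming we want the output type to mirror the subject's string type rather than hard-coding `Utf8` (untested):
   
   ```rust
   // Sketch only: follow the subject's string type instead of always Utf8.
   fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
       match arg_types.first() {
           Some(DataType::LargeUtf8) => Ok(DataType::LargeUtf8),
           _ => Ok(DataType::Utf8),
       }
   }
   ```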



##########
native/spark-expr/src/string_funcs/regexp_extract.rs:
##########
@@ -0,0 +1,401 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, GenericStringArray};
+use arrow::datatypes::DataType;
+use datafusion::common::{exec_err, internal_datafusion_err, Result, ScalarValue};
+use datafusion::logical_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use regex::Regex;
+use std::sync::Arc;
+use std::any::Any;
+
+/// Spark-compatible regexp_extract function
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkRegExpExtract {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkRegExpExtract {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkRegExpExtract {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkRegExpExtract {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        // regexp_extract always returns String
+        Ok(DataType::Utf8)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        // regexp_extract(subject, pattern, idx)
+        if args.args.len() != 3 {
+            return exec_err!(
+                "regexp_extract expects 3 arguments, got {}",
+                args.args.len()
+            );
+        }
+
+        let subject = &args.args[0];
+        let pattern = &args.args[1];
+        let idx = &args.args[2];
+
+        // Pattern must be a literal string
+        let pattern_str = match pattern {
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => s.clone(),
+            _ => {
+                return exec_err!("regexp_extract pattern must be a string literal");
+            }
+        };
+
+        // idx must be a literal int
+        let idx_val = match idx {
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(i))) => *i as usize,
+            _ => {
+                return exec_err!("regexp_extract idx must be an integer literal");
+            }

Review Comment:
   Are we sure this is only `Int32` and not wider / narrower ints like `Int64`, `Int8`, etc.?
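   
   Something like this could cover the other widths, as a rough sketch (untested):
   
   ```rust
   // Sketch: accept any integer width for the literal group index.
   // Note: negative values would still need a range check, as in the original.
   let idx_val: usize = match idx {
       ColumnarValue::Scalar(ScalarValue::Int8(Some(i))) => *i as usize,
       ColumnarValue::Scalar(ScalarValue::Int16(Some(i))) => *i as usize,
       ColumnarValue::Scalar(ScalarValue::Int32(Some(i))) => *i as usize,
       ColumnarValue::Scalar(ScalarValue::Int64(Some(i))) => *i as usize,
       _ => return exec_err!("regexp_extract idx must be an integer literal"),
   };
   ```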



##########
spark/src/main/scala/org/apache/comet/serde/strings.scala:
##########
@@ -286,3 +286,83 @@ trait CommonStringExprs {
     }
   }
 }
+
+object CometRegExpExtract extends CometExpressionSerde[RegExpExtract] {
+  override def getSupportLevel(expr: RegExpExtract): SupportLevel = {
+    // Check if the pattern is compatible with Spark or allow incompatible patterns
+    expr.regexp match {
+      case Literal(pattern, DataTypes.StringType) =>
+        if (!RegExp.isSupportedPattern(pattern.toString) &&
+          !CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.get()) {
+          withInfo(
+            expr,
+            s"Regexp pattern $pattern is not compatible with Spark. " +
+              s"Set ${CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key}=true " +
+              "to allow it anyway.")
+          return Incompatible()
+        }
+      case _ =>
+        return Unsupported(Some("Only literal regexp patterns are supported"))
+    }
+
+    // Check if idx is a literal
+    expr.idx match {
+      case Literal(_, DataTypes.IntegerType) =>
+        Compatible()
+      case _ =>
+        Unsupported(Some("Only literal group index is supported"))

Review Comment:
   Could the input for the group idx be a different numeric type (byte / short / int / long, etc.)?
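   
   For illustration, a rough sketch of widening the match (though Spark's analyzer may already coerce the index to `IntegerType`, in which case this is unnecessary):
   
   ```scala
   // Sketch: accept any integral literal as the group index.
   expr.idx match {
     case Literal(_, DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType) =>
       Compatible()
     case _ =>
       Unsupported(Some("Only literal group index is supported"))
   }
   ```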



##########
spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala:
##########
@@ -391,4 +391,118 @@ class CometStringExpressionSuite extends CometTestBase {
     }
   }
 
+  test("regexp_extract basic") {
+    import org.apache.comet.CometConf
+
+    withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
+      val data = Seq(
+        ("100-200", 1),
+        ("300-400", 1),
+        (null, 1), // NULL input
+        ("no-match", 1), // no match → should return ""
+        ("abc123def456", 1),
+        ("", 1) // empty string
+      )
+
+      withParquetTable(data, "tbl") {
+        // Test basic extraction: group 0 (full match)
+        checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '(\\d+)-(\\d+)', 0) FROM tbl")
+        // Test group 1
+        checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '(\\d+)-(\\d+)', 1) FROM tbl")
+        // Test group 2
+        checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '(\\d+)-(\\d+)', 2) FROM tbl")
+        // Test non-existent group → should return ""
+        checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '(\\d+)-(\\d+)', 3) FROM tbl")
+        // Test empty pattern
+        checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '', 0) FROM tbl")
+        // Test null pattern
+        checkSparkAnswerAndOperator("SELECT regexp_extract(_1, NULL, 0) FROM tbl")
+      }
+    }
+  }
+
+  test("regexp_extract edge cases") {
+    import org.apache.comet.CometConf
+
+    withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
+      val data =
+        Seq(("[email protected]", 1), ("phone: 123-456-7890", 1), ("price: $99.99", 1), (null, 1))
+
+      withParquetTable(data, "tbl") {
+        // Extract email domain
+        checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '@([^.]+)', 1) FROM tbl")
+        // Extract phone number
+        checkSparkAnswerAndOperator(
+          "SELECT regexp_extract(_1, '(\\d{3}-\\d{3}-\\d{4})', 1) FROM tbl")
+        // Extract price
+        checkSparkAnswerAndOperator("SELECT regexp_extract(_1, '\\$(\\d+\\.\\d+)', 1) FROM tbl")
+      }
+    }
+  }
+
+  test("regexp_extract_all basic") {
+    import org.apache.comet.CometConf
+
+    withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
+      val data = Seq(
+        ("a1b2c3", 1),
+        ("test123test456", 1),
+        (null, 1), // NULL input
+        ("no digits", 1), // no match → should return []
+        ("", 1) // empty string
+      )
+
+      withParquetTable(data, "tbl") {
+        // Test with explicit group 0 (full match on no-group pattern)
+        checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '\\d+', 0) FROM tbl")
+        // Test with explicit group 0
+        checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '(\\d+)', 0) FROM tbl")
+        // Test group 1
+        checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '(\\d+)', 1) FROM tbl")
+        // Test empty pattern
+        checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '', 0) FROM tbl")
+        // Test null pattern
+        checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, NULL, 0) FROM tbl")
+      }
+    }
+  }
+
+  test("regexp_extract_all multiple matches") {
+    import org.apache.comet.CometConf
+
+    withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
+      val data = Seq(
+        ("The prices are $10, $20, and $30", 1),
+        ("colors: red, green, blue", 1),
+        ("words: hello world", 1),
+        (null, 1))
+
+      withParquetTable(data, "tbl") {
+        // Extract all prices
+        checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '\\$(\\d+)', 1) FROM tbl")
+        // Extract all words
+        checkSparkAnswerAndOperator("SELECT regexp_extract_all(_1, '([a-z]+)', 1) FROM tbl")
+      }
+    }
+  }
+
+  test("regexp_extract_all with dictionary encoding") {
+    import org.apache.comet.CometConf
+
+    withSQLConf(
+      CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true",
+      "parquet.enable.dictionary" -> "true") {
+      // Use repeated values to trigger dictionary encoding
+      val data = (0 until 1000).map(i => {
+        val text = if (i % 3 == 0) "a1b2c3" else if (i % 3 == 1) "x5y6" else "no-match"

Review Comment:
   I believe the tests should be a little more exhaustive and also cover very large strings, mixed inputs, and multiple complex regex patterns.
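   
   For example, something along these lines (a sketch only; the test name, pattern, and sizes are arbitrary):
   
   ```scala
   test("regexp_extract very large and mixed inputs") {
     withSQLConf(CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key -> "true") {
       // Mix very long strings with nulls across rows.
       val data = (0 until 100).map { i =>
         val text = if (i % 2 == 0) "ab" * 10000 + s"-$i" else null
         (text, i)
       }
       withParquetTable(data, "tbl") {
         checkSparkAnswerAndOperator(
           "SELECT regexp_extract(_1, '((?:ab)+)-(\\d+)', 2) FROM tbl")
       }
     }
   }
   ```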



##########
native/spark-expr/src/string_funcs/regexp_extract.rs:
##########
@@ -0,0 +1,401 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, GenericStringArray};
+use arrow::datatypes::DataType;
+use datafusion::common::{exec_err, internal_datafusion_err, Result, ScalarValue};
+use datafusion::logical_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use regex::Regex;
+use std::sync::Arc;
+use std::any::Any;
+
+/// Spark-compatible regexp_extract function
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkRegExpExtract {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkRegExpExtract {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkRegExpExtract {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkRegExpExtract {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        // regexp_extract always returns String
+        Ok(DataType::Utf8)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        // regexp_extract(subject, pattern, idx)
+        if args.args.len() != 3 {
+            return exec_err!(
+                "regexp_extract expects 3 arguments, got {}",
+                args.args.len()
+            );
+        }
+
+        let subject = &args.args[0];
+        let pattern = &args.args[1];
+        let idx = &args.args[2];
+
+        // Pattern must be a literal string
+        let pattern_str = match pattern {
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => s.clone(),
+            _ => {
+                return exec_err!("regexp_extract pattern must be a string literal");
+            }
+        };
+
+        // idx must be a literal int
+        let idx_val = match idx {
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(i))) => *i as usize,
+            _ => {
+                return exec_err!("regexp_extract idx must be an integer literal");
+            }
+        };
+
+        // Compile regex once
+        let regex = Regex::new(&pattern_str).map_err(|e| {
+            internal_datafusion_err!("Invalid regex pattern '{}': {}", pattern_str, e)
+        })?;
+
+        match subject {
+            ColumnarValue::Array(array) => {
+                let result = regexp_extract_array(array, &regex, idx_val)?;
+                Ok(ColumnarValue::Array(result))
+            }
+            ColumnarValue::Scalar(ScalarValue::Utf8(s)) => {
+                let result = match s {
+                    Some(text) => Some(extract_group(text, &regex, idx_val)),
+                    None => None,  // NULL input → NULL output
+                };
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result)))
+            }
+            _ => exec_err!("regexp_extract expects string input"),
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+/// Spark-compatible regexp_extract_all function
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkRegExpExtractAll {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkRegExpExtractAll {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkRegExpExtractAll {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkRegExpExtractAll {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract_all"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        // regexp_extract_all returns Array<String>
+        Ok(DataType::List(Arc::new(
+            arrow::datatypes::Field::new("item", DataType::Utf8, true),
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        // regexp_extract_all(subject, pattern) or regexp_extract_all(subject, pattern, idx)
+        if args.args.len() < 2 || args.args.len() > 3 {
+            return exec_err!(
+                "regexp_extract_all expects 2 or 3 arguments, got {}",
+                args.args.len()
+            );
+        }
+
+        let subject = &args.args[0];
+        let pattern = &args.args[1];
+        let idx_val = if args.args.len() == 3 {
+            match &args.args[2] {
+                ColumnarValue::Scalar(ScalarValue::Int32(Some(i))) => *i as usize,
+                _ => {
+                    return exec_err!("regexp_extract_all idx must be an integer literal");

Review Comment:
   We might want to have one common DataFusion internal error helper to make sure we throw the right exception consistently across both functions.
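   
   As an illustration (helper name is mine, untested; whether this should be `exec_err!` or `internal_datafusion_err!` is a judgment call):
   
   ```rust
   // Sketch: one shared helper so both UDFs raise the same error kind.
   fn literal_group_index(arg: &ColumnarValue, fn_name: &str) -> Result<usize> {
       match arg {
           ColumnarValue::Scalar(ScalarValue::Int32(Some(i))) if *i >= 0 => Ok(*i as usize),
           _ => exec_err!("{fn_name} idx must be a non-negative integer literal"),
       }
   }
   ```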



##########
native/spark-expr/src/string_funcs/regexp_extract.rs:
##########
@@ -0,0 +1,401 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, GenericStringArray};
+use arrow::datatypes::DataType;
+use datafusion::common::{exec_err, internal_datafusion_err, Result, ScalarValue};
+use datafusion::logical_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use regex::Regex;
+use std::sync::Arc;
+use std::any::Any;
+
+/// Spark-compatible regexp_extract function
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkRegExpExtract {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkRegExpExtract {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkRegExpExtract {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkRegExpExtract {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        // regexp_extract always returns String
+        Ok(DataType::Utf8)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        // regexp_extract(subject, pattern, idx)
+        if args.args.len() != 3 {
+            return exec_err!(
+                "regexp_extract expects 3 arguments, got {}",
+                args.args.len()
+            );
+        }
+
+        let subject = &args.args[0];
+        let pattern = &args.args[1];
+        let idx = &args.args[2];
+
+        // Pattern must be a literal string
+        let pattern_str = match pattern {
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => s.clone(),
+            _ => {
+                return exec_err!("regexp_extract pattern must be a string literal");
+            }
+        };
+
+        // idx must be a literal int
+        let idx_val = match idx {
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(i))) => *i as usize,
+            _ => {
+                return exec_err!("regexp_extract idx must be an integer literal");
+            }
+        };
+
+        // Compile regex once
+        let regex = Regex::new(&pattern_str).map_err(|e| {
+            internal_datafusion_err!("Invalid regex pattern '{}': {}", pattern_str, e)
+        })?;
+
+        match subject {

Review Comment:
   We could probably remove some of the duplicated pattern/idx/regex parsing code here.
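   
   For instance, both UDFs could share something like this (a sketch; the helper name is mine):
   
   ```rust
   // Sketch: shared literal-pattern handling for both regexp UDFs.
   fn compile_literal_pattern(pattern: &ColumnarValue, fn_name: &str) -> Result<Regex> {
       let pattern_str = match pattern {
           ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => s,
           _ => return exec_err!("{fn_name} pattern must be a string literal"),
       };
       Regex::new(pattern_str)
           .map_err(|e| internal_datafusion_err!("Invalid regex pattern '{pattern_str}': {e}"))
   }
   ```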



##########
native/spark-expr/src/string_funcs/regexp_extract.rs:
##########
@@ -0,0 +1,401 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, GenericStringArray};
+use arrow::datatypes::DataType;
+use datafusion::common::{exec_err, internal_datafusion_err, Result, ScalarValue};
+use datafusion::logical_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use regex::Regex;
+use std::sync::Arc;
+use std::any::Any;
+
+/// Spark-compatible regexp_extract function
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkRegExpExtract {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkRegExpExtract {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkRegExpExtract {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkRegExpExtract {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        // regexp_extract always returns String
+        Ok(DataType::Utf8)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        // regexp_extract(subject, pattern, idx)
+        if args.args.len() != 3 {
+            return exec_err!(
+                "regexp_extract expects 3 arguments, got {}",
+                args.args.len()
+            );
+        }
+
+        let subject = &args.args[0];
+        let pattern = &args.args[1];
+        let idx = &args.args[2];
+
+        // Pattern must be a literal string
+        let pattern_str = match pattern {
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => s.clone(),
+            _ => {
+                return exec_err!("regexp_extract pattern must be a string literal");
+            }
+        };
+
+        // idx must be a literal int
+        let idx_val = match idx {
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(i))) => *i as usize,
+            _ => {
+                return exec_err!("regexp_extract idx must be an integer literal");
+            }
+        };
+
+        // Compile regex once
+        let regex = Regex::new(&pattern_str).map_err(|e| {
+            internal_datafusion_err!("Invalid regex pattern '{}': {}", pattern_str, e)
+        })?;
+
+        match subject {
+            ColumnarValue::Array(array) => {
+                let result = regexp_extract_array(array, &regex, idx_val)?;
+                Ok(ColumnarValue::Array(result))
+            }
+            ColumnarValue::Scalar(ScalarValue::Utf8(s)) => {
+                let result = match s {
+                    Some(text) => Some(extract_group(text, &regex, idx_val)),
+                    None => None,  // NULL input → NULL output
+                };
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result)))
+            }
+            _ => exec_err!("regexp_extract expects string input"),
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+/// Spark-compatible regexp_extract_all function
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkRegExpExtractAll {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkRegExpExtractAll {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkRegExpExtractAll {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkRegExpExtractAll {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_extract_all"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        // regexp_extract_all returns Array<String>
+        Ok(DataType::List(Arc::new(
+            arrow::datatypes::Field::new("item", DataType::Utf8, true),
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        // regexp_extract_all(subject, pattern) or regexp_extract_all(subject, pattern, idx)
+        if args.args.len() < 2 || args.args.len() > 3 {
+            return exec_err!(
+                "regexp_extract_all expects 2 or 3 arguments, got {}",
+                args.args.len()
+            );
+        }
+
+        let subject = &args.args[0];
+        let pattern = &args.args[1];
+        let idx_val = if args.args.len() == 3 {
+            match &args.args[2] {
+                ColumnarValue::Scalar(ScalarValue::Int32(Some(i))) => *i as usize,
+                _ => {
+                    return exec_err!("regexp_extract_all idx must be an integer literal");
+                }
+            }
+        } else {
+            0 // default to group 0 (entire match)
+        };
+
+        // Pattern must be a literal string
+        let pattern_str = match pattern {
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => s.clone(),
+            _ => {

Review Comment:
   We might have to check whether the string input is `LargeUtf8` as well, or ensure we only ever send `Utf8` from the Spark side.
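   
   If we do want to handle it natively, an or-pattern would be a small change (sketch, untested):
   
   ```rust
   // Sketch: accept a LargeUtf8 literal in addition to Utf8.
   let pattern_str = match pattern {
       ColumnarValue::Scalar(ScalarValue::Utf8(Some(s)))
       | ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(s))) => s.clone(),
       _ => return exec_err!("regexp_extract_all pattern must be a string literal"),
   };
   ```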



##########
spark/src/main/scala/org/apache/comet/serde/strings.scala:
##########
@@ -286,3 +286,83 @@ trait CommonStringExprs {
     }
   }
 }
+
+object CometRegExpExtract extends CometExpressionSerde[RegExpExtract] {
+  override def getSupportLevel(expr: RegExpExtract): SupportLevel = {
+    // Check if the pattern is compatible with Spark or allow incompatible patterns
+    expr.regexp match {
+      case Literal(pattern, DataTypes.StringType) =>
+        if (!RegExp.isSupportedPattern(pattern.toString) &&
+          !CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.get()) {
+          withInfo(
+            expr,
+            s"Regexp pattern $pattern is not compatible with Spark. " +
+              s"Set ${CometConf.COMET_REGEXP_ALLOW_INCOMPATIBLE.key}=true " +
+              "to allow it anyway.")
+          return Incompatible()
+        }
+      case _ =>
+        return Unsupported(Some("Only literal regexp patterns are supported"))
+    }
+
+    // Check if idx is a literal
+    expr.idx match {
+      case Literal(_, DataTypes.IntegerType) =>
+        Compatible()
+      case _ =>
+        Unsupported(Some("Only literal group index is supported"))
+    }
+  }
+
+  override def convert(
+      expr: RegExpExtract,
+      inputs: Seq[Attribute],
+      binding: Boolean): Option[Expr] = {
+    val subjectExpr = exprToProtoInternal(expr.subject, inputs, binding)
+    val patternExpr = exprToProtoInternal(expr.regexp, inputs, binding)
+    val idxExpr = exprToProtoInternal(expr.idx, inputs, binding)
+    val optExpr = scalarFunctionExprToProto("regexp_extract", subjectExpr, patternExpr, idxExpr)
+    optExprWithInfo(optExpr, expr, expr.subject, expr.regexp, expr.idx)
+  }
+}
+
+object CometRegExpExtractAll extends CometExpressionSerde[RegExpExtractAll] {
+  override def getSupportLevel(expr: RegExpExtractAll): SupportLevel = {
+    // Check if the pattern is compatible with Spark or allow incompatible patterns
+    expr.regexp match {
+      case Literal(pattern, DataTypes.StringType) =>
+        if (!RegExp.isSupportedPattern(pattern.toString) &&

Review Comment:
   What are the limitations of `RegExp.isSupportedPattern(pattern.toString)` here? Could we make sure we stay fully consistent with Spark's regex semantics?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
