viirya commented on code in PR #8631:
URL: https://github.com/apache/arrow-datafusion/pull/8631#discussion_r1435281598


##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -78,6 +79,82 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) 
-> Result<ArrayRef> {
     }
 }
 
+/// TODO: Remove this once it is included in arrow-rs new release.
+fn _regexp_match<OffsetSize: OffsetSizeTrait>(
+    array: &GenericStringArray<OffsetSize>,
+    regex_array: &GenericStringArray<OffsetSize>,
+    flags_array: Option<&GenericStringArray<OffsetSize>>,
+) -> std::result::Result<ArrayRef, ArrowError> {
+    let mut patterns: std::collections::HashMap<String, Regex> =
+        std::collections::HashMap::new();
+    let builder: GenericStringBuilder<OffsetSize> =
+        GenericStringBuilder::with_capacity(0, 0);
+    let mut list_builder = ListBuilder::new(builder);
+
+    let complete_pattern = match flags_array {
+        Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
+            |(pattern, flags)| {
+                pattern.map(|pattern| match flags {
+                    Some(value) => format!("(?{value}){pattern}"),
+                    None => pattern.to_string(),
+                })
+            },
+        )) as Box<dyn Iterator<Item = Option<String>>>,
+        None => Box::new(
+            regex_array
+                .iter()
+                .map(|pattern| pattern.map(|pattern| pattern.to_string())),
+        ),
+    };
+
+    array
+        .iter()
+        .zip(complete_pattern)
+        .map(|(value, pattern)| {
+            match (value, pattern) {
+                // Required for Postgres compatibility:
+                // SELECT regexp_match('foobarbequebaz', ''); = {""}
+                (Some(_), Some(pattern)) if pattern == *"" => {
+                    list_builder.values().append_value("");
+                    list_builder.append(true);
+                }
+                (Some(value), Some(pattern)) => {
+                    let existing_pattern = patterns.get(&pattern);
+                    let re = match existing_pattern {
+                        Some(re) => re,

Review Comment:
   This is actual fix. Getting rid of cloning `Regex` per row.



##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -78,6 +79,82 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) 
-> Result<ArrayRef> {
     }
 }
 
+/// TODO: Remove this once it is included in arrow-rs new release.
+fn _regexp_match<OffsetSize: OffsetSizeTrait>(
+    array: &GenericStringArray<OffsetSize>,
+    regex_array: &GenericStringArray<OffsetSize>,
+    flags_array: Option<&GenericStringArray<OffsetSize>>,
+) -> std::result::Result<ArrayRef, ArrowError> {
+    let mut patterns: std::collections::HashMap<String, Regex> =
+        std::collections::HashMap::new();
+    let builder: GenericStringBuilder<OffsetSize> =
+        GenericStringBuilder::with_capacity(0, 0);
+    let mut list_builder = ListBuilder::new(builder);
+
+    let complete_pattern = match flags_array {
+        Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
+            |(pattern, flags)| {
+                pattern.map(|pattern| match flags {
+                    Some(value) => format!("(?{value}){pattern}"),
+                    None => pattern.to_string(),
+                })
+            },
+        )) as Box<dyn Iterator<Item = Option<String>>>,
+        None => Box::new(
+            regex_array
+                .iter()
+                .map(|pattern| pattern.map(|pattern| pattern.to_string())),
+        ),
+    };
+
+    array
+        .iter()
+        .zip(complete_pattern)
+        .map(|(value, pattern)| {
+            match (value, pattern) {
+                // Required for Postgres compatibility:
+                // SELECT regexp_match('foobarbequebaz', ''); = {""}
+                (Some(_), Some(pattern)) if pattern == *"" => {
+                    list_builder.values().append_value("");
+                    list_builder.append(true);
+                }
+                (Some(value), Some(pattern)) => {
+                    let existing_pattern = patterns.get(&pattern);
+                    let re = match existing_pattern {
+                        Some(re) => re,
+                        None => {
+                            let re = Regex::new(pattern.as_str()).map_err(|e| {
+                                ArrowError::ComputeError(format!(
+                                    "Regular expression did not compile: {e:?}"
+                                ))
+                            })?;
+                            patterns.insert(pattern.clone(), re);
+                            patterns.get(&pattern).unwrap()

Review Comment:
   And here too.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to