This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 72c9505bed Improve regexp kernels performance by avoiding cloning Regex (#5235)
72c9505bed is described below

commit 72c9505bed6260865fa95a637f37219cdc129a0e
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Sat Dec 23 02:09:55 2023 -0800

    Improve regexp kernels performance by avoiding cloning Regex (#5235)
    
    * Improve regexp_match performance by avoiding cloning Regex
    
    * For review
---
 arrow-string/src/regexp.rs      | 10 ++++------
 arrow/Cargo.toml                |  5 +++++
 arrow/benches/regexp_kernels.rs | 44 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs
index 34bb1b0b4c..25c712d20f 100644
--- a/arrow-string/src/regexp.rs
+++ b/arrow-string/src/regexp.rs
@@ -81,15 +81,14 @@ pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
                 (Some(value), Some(pattern)) => {
                     let existing_pattern = patterns.get(&pattern);
                     let re = match existing_pattern {
-                        Some(re) => re.clone(),
+                        Some(re) => re,
                         None => {
                             let re = Regex::new(pattern.as_str()).map_err(|e| {
                                 ArrowError::ComputeError(format!(
                                     "Regular expression did not compile: {e:?}"
                                 ))
                             })?;
-                            patterns.insert(pattern, re.clone());
-                            re
+                            patterns.entry(pattern).or_insert(re)
                         }
                     };
                     result.append(re.is_match(value));
@@ -216,15 +215,14 @@ pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
                 (Some(value), Some(pattern)) => {
                     let existing_pattern = patterns.get(&pattern);
                     let re = match existing_pattern {
-                        Some(re) => re.clone(),
+                        Some(re) => re,
                         None => {
                             let re = Regex::new(pattern.as_str()).map_err(|e| {
                                 ArrowError::ComputeError(format!(
                                     "Regular expression did not compile: {e:?}"
                                 ))
                             })?;
-                            patterns.insert(pattern, re.clone());
-                            re
+                            patterns.entry(pattern).or_insert(re)
                         }
                     };
                     match re.captures(value) {
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index a6b4ddf51d..168a58b295 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -247,6 +247,11 @@ name = "substring_kernels"
 harness = false
 required-features = ["test_utils"]
 
+[[bench]]
+name = "regexp_kernels"
+harness = false
+required-features = ["test_utils"]
+
 [[bench]]
 name = "array_data_validate"
 harness = false
diff --git a/arrow/benches/regexp_kernels.rs b/arrow/benches/regexp_kernels.rs
new file mode 100644
index 0000000000..eb38ba6783
--- /dev/null
+++ b/arrow/benches/regexp_kernels.rs
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[macro_use]
+extern crate criterion;
+use criterion::Criterion;
+
+extern crate arrow;
+
+use arrow::array::*;
+use arrow::compute::kernels::regexp::*;
+use arrow::util::bench_util::*;
+
+fn bench_regexp(arr: &GenericStringArray<i32>, regex_array: &GenericStringArray<i32>) {
+    regexp_match(criterion::black_box(arr), regex_array, None).unwrap();
+}
+
+fn add_benchmark(c: &mut Criterion) {
+    let size = 65536;
+    let val_len = 1000;
+
+    let arr_string = create_string_array_with_len::<i32>(size, 0.0, val_len);
+    let pattern_values = vec![r".*-(\d*)-.*"; size];
+    let pattern = GenericStringArray::<i32>::from(pattern_values);
+
+    c.bench_function("regexp", |b| b.iter(|| bench_regexp(&arr_string, &pattern)));
+}
+
+criterion_group!(benches, add_benchmark);
+criterion_main!(benches);

Reply via email to