This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 72c9505bed Improve regexp kernels performance by avoiding cloning Regex (#5235)
72c9505bed is described below
commit 72c9505bed6260865fa95a637f37219cdc129a0e
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Sat Dec 23 02:09:55 2023 -0800
Improve regexp kernels performance by avoiding cloning Regex (#5235)
* Improve regexp_match performance by avoiding cloning Regex
* For review
---
arrow-string/src/regexp.rs | 10 ++++------
arrow/Cargo.toml | 5 +++++
arrow/benches/regexp_kernels.rs | 44 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 53 insertions(+), 6 deletions(-)
diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs
index 34bb1b0b4c..25c712d20f 100644
--- a/arrow-string/src/regexp.rs
+++ b/arrow-string/src/regexp.rs
@@ -81,15 +81,14 @@ pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
(Some(value), Some(pattern)) => {
let existing_pattern = patterns.get(&pattern);
let re = match existing_pattern {
- Some(re) => re.clone(),
+ Some(re) => re,
None => {
let re = Regex::new(pattern.as_str()).map_err(|e| {
ArrowError::ComputeError(format!(
"Regular expression did not compile: {e:?}"
))
})?;
- patterns.insert(pattern, re.clone());
- re
+ patterns.entry(pattern).or_insert(re)
}
};
result.append(re.is_match(value));
@@ -216,15 +215,14 @@ pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
(Some(value), Some(pattern)) => {
let existing_pattern = patterns.get(&pattern);
let re = match existing_pattern {
- Some(re) => re.clone(),
+ Some(re) => re,
None => {
let re = Regex::new(pattern.as_str()).map_err(|e| {
ArrowError::ComputeError(format!(
"Regular expression did not compile: {e:?}"
))
})?;
- patterns.insert(pattern, re.clone());
- re
+ patterns.entry(pattern).or_insert(re)
}
};
match re.captures(value) {
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index a6b4ddf51d..168a58b295 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -247,6 +247,11 @@ name = "substring_kernels"
harness = false
required-features = ["test_utils"]
+[[bench]]
+name = "regexp_kernels"
+harness = false
+required-features = ["test_utils"]
+
[[bench]]
name = "array_data_validate"
harness = false
diff --git a/arrow/benches/regexp_kernels.rs b/arrow/benches/regexp_kernels.rs
new file mode 100644
index 0000000000..eb38ba6783
--- /dev/null
+++ b/arrow/benches/regexp_kernels.rs
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[macro_use]
+extern crate criterion;
+use criterion::Criterion;
+
+extern crate arrow;
+
+use arrow::array::*;
+use arrow::compute::kernels::regexp::*;
+use arrow::util::bench_util::*;
+
+fn bench_regexp(arr: &GenericStringArray<i32>, regex_array: &GenericStringArray<i32>) {
+ regexp_match(criterion::black_box(arr), regex_array, None).unwrap();
+}
+
+fn add_benchmark(c: &mut Criterion) {
+ let size = 65536;
+ let val_len = 1000;
+
+ let arr_string = create_string_array_with_len::<i32>(size, 0.0, val_len);
+ let pattern_values = vec![r".*-(\d*)-.*"; size];
+ let pattern = GenericStringArray::<i32>::from(pattern_values);
+
+ c.bench_function("regexp", |b| b.iter(|| bench_regexp(&arr_string, &pattern)));
+}
+
+criterion_group!(benches, add_benchmark);
+criterion_main!(benches);