This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new d05cf6d5e Implement native support StringViewArray for
`regexp_is_match` and `regexp_is_match_scalar` function, deprecate
`regexp_is_match_utf8` and `regexp_is_match_utf8_scalar` (#6376)
d05cf6d5e is described below
commit d05cf6d5e74e79ddcacaa4a68bddaba230b0f163
Author: Tai Le Manh <[email protected]>
AuthorDate: Sat Sep 21 13:44:45 2024 -0700
Implement native support StringViewArray for `regexp_is_match` and
`regexp_is_match_scalar` function, deprecate `regexp_is_match_utf8` and
`regexp_is_match_utf8_scalar` (#6376)
* Implement native support StringViewArray for regexp_is_match function
* Update test cases to cover StringViewArray length more than 12 bytes
* Add StringView benchmark for regexp_is_match
Signed-off-by: Tai Le Manh <[email protected]>
* Implement native support StringViewArray for regexp_is_match function
Signed-off-by: Tai Le Manh <[email protected]>
* Remove duplicate implementation, fix clippy, add docs
more
---------
Signed-off-by: Tai Le Manh <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-string/src/like.rs | 2 +-
arrow-string/src/regexp.rs | 228 ++++++++++++++++++++++++++++++++----
arrow/benches/comparison_kernels.rs | 67 +++++++++--
arrow/src/compute/kernels.rs | 3 +
4 files changed, 261 insertions(+), 39 deletions(-)
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index 4626be136..4a6c5bab9 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -155,7 +155,7 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) ->
Result<BooleanArray, Arr
///
/// This trait helps to abstract over the different types of string arrays
/// so that we don't need to duplicate the implementation for each type.
-trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
+pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
fn is_ascii(&self) -> bool;
fn iter(&self) -> ArrayIter<Self>;
}
diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs
index f79eff4b6..5ad452a17 100644
--- a/arrow-string/src/regexp.rs
+++ b/arrow-string/src/regexp.rs
@@ -18,6 +18,8 @@
//! Defines kernel to extract substrings based on a regular
//! expression of a \[Large\]StringArray
+use crate::like::StringArrayType;
+
use arrow_array::builder::{BooleanBufferBuilder, GenericStringBuilder,
ListBuilder};
use arrow_array::cast::AsArray;
use arrow_array::*;
@@ -25,6 +27,7 @@ use arrow_buffer::NullBuffer;
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, Field};
use regex::Regex;
+
use std::collections::HashMap;
use std::sync::Arc;
@@ -35,16 +38,64 @@ use std::sync::Arc;
/// special search modes, such as case insensitive and multi-line mode.
/// See the documentation
[here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
/// for more information.
+#[deprecated(since = "54.0.0", note = "please use `regex_is_match` instead")]
pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
regex_array: &GenericStringArray<OffsetSize>,
flags_array: Option<&GenericStringArray<OffsetSize>>,
) -> Result<BooleanArray, ArrowError> {
+ regexp_is_match(array, regex_array, flags_array)
+}
+
+/// Return BooleanArray indicating which strings in an array match an array of
+/// regular expressions.
+///
+/// This is equivalent to the SQL `array ~ regex_array`, supporting
+/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`].
+///
+/// If `regex_array` element has an empty value, the corresponding result
value is always true.
+///
+/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] /
[`StringViewArray`] flag,
+/// which allow special search modes, such as case-insensitive and multi-line
mode.
+/// See the documentation
[here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
+/// for more information.
+///
+/// # See Also
+/// * [`regexp_is_match_scalar`] for matching a single regular expression
against an array of strings
+/// * [`regexp_match`] for extracting groups from a string array based on a
regular expression
+///
+/// # Example
+/// ```
+/// # use arrow_array::{StringArray, BooleanArray};
+/// # use arrow_string::regexp::regexp_is_match;
+/// // First array is the array of strings to match
+/// let array = StringArray::from(vec!["Foo", "Bar", "FooBar", "Baz"]);
+/// // Second array is the array of regular expressions to match against
+/// let regex_array = StringArray::from(vec!["^Foo", "^Foo", "Bar$", "Baz"]);
+/// // Third array is the array of flags to use for each regular expression,
if desired
+/// // (the type must be provided to satisfy type inference for the third
parameter)
+/// let flags_array: Option<&StringArray> = None;
+/// // The result is a BooleanArray indicating when each string in `array`
+/// // matches the corresponding regular expression in `regex_array`
+/// let result = regexp_is_match(&array, &regex_array, flags_array).unwrap();
+/// assert_eq!(result, BooleanArray::from(vec![true, false, true, true]));
+/// ```
+pub fn regexp_is_match<'a, S1, S2, S3>(
+ array: &'a S1,
+ regex_array: &'a S2,
+ flags_array: Option<&'a S3>,
+) -> Result<BooleanArray, ArrowError>
+where
+ &'a S1: StringArrayType<'a>,
+ &'a S2: StringArrayType<'a>,
+ &'a S3: StringArrayType<'a>,
+{
if array.len() != regex_array.len() {
return Err(ArrowError::ComputeError(
"Cannot perform comparison operation on arrays of different
length".to_string(),
));
}
+
let nulls = NullBuffer::union(array.nulls(), regex_array.nulls());
let mut patterns: HashMap<String, Regex> = HashMap::new();
@@ -107,6 +158,7 @@ pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
.nulls(nulls)
.build_unchecked()
};
+
Ok(BooleanArray::from(data))
}
@@ -114,11 +166,47 @@ pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
/// [`LargeStringArray`] and a scalar.
///
/// See the documentation on [`regexp_is_match_utf8`] for more details.
+#[deprecated(since = "54.0.0", note = "please use `regex_is_match_scalar`
instead")]
pub fn regexp_is_match_utf8_scalar<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
regex: &str,
flag: Option<&str>,
) -> Result<BooleanArray, ArrowError> {
+ regexp_is_match_scalar(array, regex, flag)
+}
+
+/// Return BooleanArray indicating which strings in an array match a single
regular expression.
+///
+/// This is equivalent to the SQL `array ~ regex_array`, supporting
+/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] and a scalar.
+///
+/// See the documentation on [`regexp_is_match`] for more details on arguments
+///
+/// # See Also
+/// * [`regexp_is_match`] for matching an array of regular expression against
an array of strings
+/// * [`regexp_match`] for extracting groups from a string array based on a
regular expression
+///
+/// # Example
+/// ```
+/// # use arrow_array::{StringArray, BooleanArray};
+/// # use arrow_string::regexp::regexp_is_match_scalar;
+/// // array of strings to match
+/// let array = StringArray::from(vec!["Foo", "Bar", "FooBar", "Baz"]);
+/// let regexp = "^Foo"; // regular expression to match against
+/// let flags: Option<&str> = None; // flags can control the matching behavior
+/// // The result is a BooleanArray indicating when each string in `array`
+/// // matches the regular expression `regexp`
+/// let result = regexp_is_match_scalar(&array, regexp, None).unwrap();
+/// assert_eq!(result, BooleanArray::from(vec![true, false, true, false]));
+/// ```
+pub fn regexp_is_match_scalar<'a, S>(
+ array: &'a S,
+ regex: &str,
+ flag: Option<&str>,
+) -> Result<BooleanArray, ArrowError>
+where
+ &'a S: StringArrayType<'a>,
+{
let null_bit_buffer = array.nulls().map(|x| x.inner().sliced());
let mut result = BooleanBufferBuilder::new(array.len());
@@ -126,6 +214,7 @@ pub fn regexp_is_match_utf8_scalar<OffsetSize:
OffsetSizeTrait>(
Some(flag) => format!("(?{flag}){regex}"),
None => regex.to_string(),
};
+
if pattern.is_empty() {
result.append_n(array.len(), true);
} else {
@@ -150,6 +239,7 @@ pub fn regexp_is_match_utf8_scalar<OffsetSize:
OffsetSizeTrait>(
vec![],
)
};
+
Ok(BooleanArray::from(data))
}
@@ -303,6 +393,9 @@ fn regexp_scalar_match<OffsetSize: OffsetSizeTrait>(
/// The flags parameter is an optional text string containing zero or more
single-letter flags
/// that change the function's behavior.
///
+/// # See Also
+/// * [`regexp_is_match`] for matching (rather than extracting) a regular
expression against an array of strings
+///
/// [regexp_match]:
https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP
pub fn regexp_match(
array: &dyn Array,
@@ -517,8 +610,8 @@ mod tests {
($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr)
=> {
#[test]
fn $test_name() {
- let left = StringArray::from($left);
- let right = StringArray::from($right);
+ let left = $left;
+ let right = $right;
let res = $op(&left, &right, None).unwrap();
let expected = $expected;
assert_eq!(expected.len(), res.len());
@@ -531,9 +624,9 @@ mod tests {
($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr,
$expected:expr) => {
#[test]
fn $test_name() {
- let left = StringArray::from($left);
- let right = StringArray::from($right);
- let flag = Some(StringArray::from($flag));
+ let left = $left;
+ let right = $right;
+ let flag = Some($flag);
let res = $op(&left, &right, flag.as_ref()).unwrap();
let expected = $expected;
assert_eq!(expected.len(), res.len());
@@ -549,7 +642,7 @@ mod tests {
($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr)
=> {
#[test]
fn $test_name() {
- let left = StringArray::from($left);
+ let left = $left;
let res = $op(&left, $right, None).unwrap();
let expected = $expected;
assert_eq!(expected.len(), res.len());
@@ -569,7 +662,7 @@ mod tests {
($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr,
$expected:expr) => {
#[test]
fn $test_name() {
- let left = StringArray::from($left);
+ let left = $left;
let flag = Some($flag);
let res = $op(&left, $right, flag).unwrap();
let expected = $expected;
@@ -590,41 +683,126 @@ mod tests {
}
test_flag_utf8!(
- test_utf8_array_regexp_is_match,
- vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"],
- vec!["^ar", "^AR", "ow$", "OW$", "foo", ""],
- regexp_is_match_utf8,
+ test_array_regexp_is_match_utf8,
+ StringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow",
"arrow"]),
+ StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
+ regexp_is_match::<StringArray, StringArray, StringArray>,
[true, false, true, false, false, true]
);
test_flag_utf8!(
- test_utf8_array_regexp_is_match_insensitive,
- vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"],
- vec!["^ar", "^AR", "ow$", "OW$", "foo", ""],
- vec!["i"; 6],
- regexp_is_match_utf8,
+ test_array_regexp_is_match_utf8_insensitive,
+ StringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow",
"arrow"]),
+ StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
+ StringArray::from(vec!["i"; 6]),
+ regexp_is_match,
[true, true, true, true, false, true]
);
test_flag_utf8_scalar!(
- test_utf8_array_regexp_is_match_scalar,
- vec!["arrow", "ARROW", "parquet", "PARQUET"],
+ test_array_regexp_is_match_utf8_scalar,
+ StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]),
"^ar",
- regexp_is_match_utf8_scalar,
+ regexp_is_match_scalar,
[true, false, false, false]
);
test_flag_utf8_scalar!(
- test_utf8_array_regexp_is_match_empty_scalar,
- vec!["arrow", "ARROW", "parquet", "PARQUET"],
+ test_array_regexp_is_match_utf8_scalar_empty,
+ StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]),
"",
- regexp_is_match_utf8_scalar,
+ regexp_is_match_scalar,
[true, true, true, true]
);
test_flag_utf8_scalar!(
- test_utf8_array_regexp_is_match_insensitive_scalar,
- vec!["arrow", "ARROW", "parquet", "PARQUET"],
+ test_array_regexp_is_match_utf8_scalar_insensitive,
+ StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]),
"^ar",
"i",
- regexp_is_match_utf8_scalar,
+ regexp_is_match_scalar,
+ [true, true, false, false]
+ );
+
+ test_flag_utf8!(
+ tes_array_regexp_is_match,
+ StringViewArray::from(vec!["arrow", "arrow", "arrow", "arrow",
"arrow", "arrow"]),
+ StringViewArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
+ regexp_is_match::<StringViewArray, StringViewArray, StringViewArray>,
+ [true, false, true, false, false, true]
+ );
+ test_flag_utf8!(
+ test_array_regexp_is_match_2,
+ StringViewArray::from(vec!["arrow", "arrow", "arrow", "arrow",
"arrow", "arrow"]),
+ StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
+ regexp_is_match::<StringViewArray, GenericStringArray<i32>,
GenericStringArray<i32>>,
+ [true, false, true, false, false, true]
+ );
+ test_flag_utf8!(
+ test_array_regexp_is_match_insensitive,
+ StringViewArray::from(vec![
+ "Official Rust implementation of Apache Arrow",
+ "apache/arrow-rs",
+ "apache/arrow-rs",
+ "parquet",
+ "parquet",
+ "row",
+ "row",
+ ]),
+ StringViewArray::from(vec![
+ ".*rust implement.*",
+ "^ap",
+ "^AP",
+ "et$",
+ "ET$",
+ "foo",
+ ""
+ ]),
+ StringViewArray::from(vec!["i"; 7]),
+ regexp_is_match::<StringViewArray, StringViewArray, StringViewArray>,
+ [true, true, true, true, true, false, true]
+ );
+ test_flag_utf8!(
+ test_array_regexp_is_match_insensitive_2,
+ LargeStringArray::from(vec!["arrow", "arrow", "arrow", "arrow",
"arrow", "arrow"]),
+ StringViewArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
+ StringArray::from(vec!["i"; 6]),
+ regexp_is_match::<GenericStringArray<i64>, StringViewArray,
GenericStringArray<i32>>,
+ [true, true, true, true, false, true]
+ );
+
+ test_flag_utf8_scalar!(
+ test_array_regexp_is_match_scalar,
+ StringViewArray::from(vec![
+ "apache/arrow-rs",
+ "APACHE/ARROW-RS",
+ "parquet",
+ "PARQUET",
+ ]),
+ "^ap",
+ regexp_is_match_scalar::<StringViewArray>,
+ [true, false, false, false]
+ );
+ test_flag_utf8_scalar!(
+ test_array_regexp_is_match_scalar_empty,
+ StringViewArray::from(vec![
+ "apache/arrow-rs",
+ "APACHE/ARROW-RS",
+ "parquet",
+ "PARQUET",
+ ]),
+ "",
+ regexp_is_match_scalar::<StringViewArray>,
+ [true, true, true, true]
+ );
+ test_flag_utf8_scalar!(
+ test_array_regexp_is_match_scalar_insensitive,
+ StringViewArray::from(vec![
+ "apache/arrow-rs",
+ "APACHE/ARROW-RS",
+ "parquet",
+ "PARQUET",
+ ]),
+ "^ap",
+ "i",
+ regexp_is_match_scalar::<StringViewArray>,
[true, true, false, false]
);
}
diff --git a/arrow/benches/comparison_kernels.rs
b/arrow/benches/comparison_kernels.rs
index c8aa7dfcf..4c4a63a77 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -15,19 +15,18 @@
// specific language governing permissions and limitations
// under the License.
+extern crate arrow;
#[macro_use]
extern crate criterion;
-use arrow::util::test_util::seedable_rng;
-use criterion::Criterion;
-
-extern crate arrow;
use arrow::compute::kernels::cmp::*;
use arrow::util::bench_util::*;
+use arrow::util::test_util::seedable_rng;
use arrow::{array::*, datatypes::Float32Type, datatypes::Int32Type};
use arrow_buffer::IntervalMonthDayNano;
use arrow_string::like::*;
-use arrow_string::regexp::regexp_is_match_utf8_scalar;
+use arrow_string::regexp::regexp_is_match_scalar;
+use criterion::Criterion;
use rand::rngs::StdRng;
use rand::Rng;
@@ -53,8 +52,17 @@ fn bench_nilike_utf8_scalar(arr_a: &StringArray, value_b:
&str) {
nilike(arr_a, &StringArray::new_scalar(value_b)).unwrap();
}
-fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) {
- regexp_is_match_utf8_scalar(
+fn bench_stringview_regexp_is_match_scalar(arr_a: &StringViewArray, value_b:
&str) {
+ regexp_is_match_scalar(
+ criterion::black_box(arr_a),
+ criterion::black_box(value_b),
+ None,
+ )
+ .unwrap();
+}
+
+fn bench_string_regexp_is_match_scalar(arr_a: &StringArray, value_b: &str) {
+ regexp_is_match_scalar(
criterion::black_box(arr_a),
criterion::black_box(value_b),
None,
@@ -78,6 +86,7 @@ fn add_benchmark(c: &mut Criterion) {
let arr_month_day_nano_b = create_month_day_nano_array_with_seed(SIZE,
0.0, 43);
let arr_string = create_string_array::<i32>(SIZE, 0.0);
+ let arr_string_view = create_string_view_array(SIZE, 0.0);
let scalar = Float32Array::from(vec![1.0]);
@@ -343,13 +352,45 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xx_xX%xXX"))
});
- c.bench_function("regexp_matches_utf8 scalar starts with", |b| {
- b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "^xx"))
- });
+ // StringArray: regexp_matches_utf8 scalar benchmarks
+ let mut group =
+ c.benchmark_group("StringArray: regexp_matches_utf8 scalar
benchmarks".to_string());
- c.bench_function("regexp_matches_utf8 scalar ends with", |b| {
- b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
- });
+ group
+ .bench_function("regexp_matches_utf8 scalar starts with", |b| {
+ b.iter(|| bench_string_regexp_is_match_scalar(&arr_string, "^xx"))
+ })
+ .bench_function("regexp_matches_utf8 scalar contains", |b| {
+ b.iter(|| bench_string_regexp_is_match_scalar(&arr_string,
".*xxXX.*"))
+ })
+ .bench_function("regexp_matches_utf8 scalar ends with", |b| {
+ b.iter(|| bench_string_regexp_is_match_scalar(&arr_string, "xx$"))
+ })
+ .bench_function("regexp_matches_utf8 scalar complex", |b| {
+ b.iter(|| bench_string_regexp_is_match_scalar(&arr_string,
".*x{2}.xX.*xXX"))
+ });
+
+ group.finish();
+
+ // StringViewArray: regexp_matches_utf8view scalar benchmarks
+ group =
+ c.benchmark_group("StringViewArray: regexp_matches_utf8view scalar
benchmarks".to_string());
+
+ group
+ .bench_function("regexp_matches_utf8view scalar starts with", |b| {
+ b.iter(||
bench_stringview_regexp_is_match_scalar(&arr_string_view, "^xx"))
+ })
+ .bench_function("regexp_matches_utf8view scalar contains", |b| {
+ b.iter(||
bench_stringview_regexp_is_match_scalar(&arr_string_view, ".*xxXX.*"))
+ })
+ .bench_function("regexp_matches_utf8view scalar ends with", |b| {
+ b.iter(||
bench_stringview_regexp_is_match_scalar(&arr_string_view, "xx$"))
+ })
+ .bench_function("regexp_matches_utf8view scalar complex", |b| {
+ b.iter(||
bench_stringview_regexp_is_match_scalar(&arr_string_view, ".*x{2}.xX.*xXX"))
+ });
+
+ group.finish();
// DictionaryArray benchmarks
diff --git a/arrow/src/compute/kernels.rs b/arrow/src/compute/kernels.rs
index 4eeb5892c..426952ebb 100644
--- a/arrow/src/compute/kernels.rs
+++ b/arrow/src/compute/kernels.rs
@@ -28,5 +28,8 @@ pub use arrow_string::{concat_elements, length, regexp,
substring};
pub mod comparison {
pub use arrow_ord::comparison::*;
pub use arrow_string::like::*;
+ // continue to export deprecated methods until they are removed
+ pub use arrow_string::regexp::{regexp_is_match, regexp_is_match_scalar};
+ #[allow(deprecated)]
pub use arrow_string::regexp::{regexp_is_match_utf8,
regexp_is_match_utf8_scalar};
}