This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 99b2b3b92e Add additional benchmarks for utf8view comparison kernels
(#7351)
99b2b3b92e is described below
commit 99b2b3b92ed31927d7542046db47baccbef67888
Author: Qi Zhu <[email protected]>
AuthorDate: Sun Mar 30 20:18:05 2025 +0800
Add additional benchmarks for utf8view comparison kernels (#7351)
* Add reproducer cases in which Utf8View is slower than Utf8
* fix fmt
* Add the original format for new benchmark
* fmt
* Fix clippy
---
arrow/benches/comparison_kernels.rs | 97 ++++++++++++++++++++++++++++++++++--
arrow/src/util/bench_util.rs | 98 +++++++++++++++++++++++++++++++++++++
2 files changed, 191 insertions(+), 4 deletions(-)
diff --git a/arrow/benches/comparison_kernels.rs
b/arrow/benches/comparison_kernels.rs
index 84fd47acc1..c12fd2ad35 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -32,14 +32,14 @@ use rand::Rng;
const SIZE: usize = 65536;
-fn bench_like_utf8view_scalar(arr_a: &StringViewArray, value_b: &str) {
- like(arr_a, &StringViewArray::new_scalar(value_b)).unwrap();
-}
-
fn bench_like_utf8_scalar(arr_a: &StringArray, value_b: &str) {
like(arr_a, &StringArray::new_scalar(value_b)).unwrap();
}
+fn bench_like_utf8view_scalar(arr_a: &StringViewArray, value_b: &str) {
+ like(arr_a, &StringViewArray::new_scalar(value_b)).unwrap();
+}
+
fn bench_nlike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
nlike(arr_a, &StringArray::new_scalar(value_b)).unwrap();
}
@@ -88,6 +88,16 @@ fn add_benchmark(c: &mut Criterion) {
let arr_string = create_string_array::<i32>(SIZE, 0.0);
let arr_string_view = create_string_view_array(SIZE, 0.0);
+ // create long string arrays with the same prefix
+ let arr_long_string =
create_longer_string_array_with_same_prefix::<i32>(SIZE, 0.0);
+ let arr_long_string_view =
create_longer_string_view_array_with_same_prefix(SIZE, 0.0);
+
+ let left_arr_long_string =
create_longer_string_array_with_same_prefix::<i32>(SIZE, 0.0);
+ let right_arr_long_string =
create_longer_string_array_with_same_prefix::<i32>(SIZE, 0.0);
+
+ let left_arr_long_string_view =
create_longer_string_view_array_with_same_prefix(SIZE, 0.0);
+ let right_arr_long_string_view =
create_longer_string_view_array_with_same_prefix(SIZE, 0.0);
+
let scalar = Float32Array::from(vec![1.0]);
// eq benchmarks
@@ -225,6 +235,31 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| eq(&string_view_left, &string_view_right).unwrap())
});
+ // eq benchmarks for long strings with the same prefix
+ c.bench_function("eq long same prefix strings StringArray", |b| {
+ b.iter(|| eq(&left_arr_long_string, &right_arr_long_string).unwrap())
+ });
+
+ c.bench_function("neq long same prefix strings StringArray", |b| {
+ b.iter(|| neq(&left_arr_long_string, &right_arr_long_string).unwrap())
+ });
+
+ c.bench_function("lt long same prefix strings StringArray", |b| {
+ b.iter(|| lt(&left_arr_long_string, &right_arr_long_string).unwrap())
+ });
+
+ c.bench_function("eq long same prefix strings StringViewArray", |b| {
+ b.iter(|| eq(&left_arr_long_string_view,
&right_arr_long_string_view).unwrap())
+ });
+
+ c.bench_function("neq long same prefix strings StringViewArray", |b| {
+ b.iter(|| neq(&left_arr_long_string_view,
&right_arr_long_string_view).unwrap())
+ });
+
+ c.bench_function("lt long same prefix strings StringViewArray", |b| {
+ b.iter(|| lt(&left_arr_long_string_view,
&right_arr_long_string_view).unwrap())
+ });
+
// StringArray: LIKE benchmarks
c.bench_function("like_utf8 scalar equals", |b| {
@@ -247,6 +282,60 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_like_utf8_scalar(&arr_string, "%xx_xx%xxx"))
});
+ // StringArray: LIKE benchmarks with long strings sharing the same 4-byte prefix
+ // Note:
+ // "long strings" here are strings that all start with the same 4 bytes (e.g. "test"),
+ // followed by a tail, so that the total length is greater than 12 bytes.
+ c.bench_function("long same prefix strings like_utf8 scalar equals", |b| {
+ b.iter(|| bench_like_utf8_scalar(&arr_long_string, "prefix_1234"))
+ });
+
+ c.bench_function("long same prefix strings like_utf8 scalar contains", |b|
{
+ b.iter(|| bench_like_utf8_scalar(&arr_long_string, "%prefix_1234%"))
+ });
+
+ c.bench_function("long same prefix strings like_utf8 scalar ends with",
|b| {
+ b.iter(|| bench_like_utf8_scalar(&arr_long_string, "%prefix_1234"))
+ });
+
+ c.bench_function(
+ "long same prefix strings like_utf8 scalar starts with",
+ |b| b.iter(|| bench_like_utf8_scalar(&arr_long_string,
"prefix_1234%")),
+ );
+
+ c.bench_function("long same prefix strings like_utf8 scalar complex", |b| {
+ b.iter(|| bench_like_utf8_scalar(&arr_long_string, "%prefix_1234%xxx"))
+ });
+
+ // StringViewArray: LIKE benchmarks with long strings sharing the same 4-byte prefix
+ // Note:
+ // "long strings" here are strings that all start with the same 4 bytes (e.g. "test"),
+ // followed by a tail, so that the total length is greater than 12 bytes.
+ c.bench_function(
+ "long same prefix strings like_utf8view scalar equals",
+ |b| b.iter(|| bench_like_utf8view_scalar(&arr_long_string_view,
"prefix_1234")),
+ );
+
+ c.bench_function(
+ "long same prefix strings like_utf8view scalar contains",
+ |b| b.iter(|| bench_like_utf8view_scalar(&arr_long_string_view,
"%prefix_1234%")),
+ );
+
+ c.bench_function(
+ "long same prefix strings like_utf8view scalar ends with",
+ |b| b.iter(|| bench_like_utf8view_scalar(&arr_long_string_view,
"%prefix_1234")),
+ );
+
+ c.bench_function(
+ "long same prefix strings like_utf8view scalar starts with",
+ |b| b.iter(|| bench_like_utf8view_scalar(&arr_long_string_view,
"prefix_1234%")),
+ );
+
+ c.bench_function(
+ "long same prefix strings like_utf8view scalar complex",
+ |b| b.iter(|| bench_like_utf8view_scalar(&arr_long_string_view,
"%prefix_1234%xxx")),
+ );
+
// StringViewArray: LIKE benchmarks
// Note: since like/nlike share the same implementation, we only benchmark
one
c.bench_function("like_utf8view scalar equals", |b| {
diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs
index 387d9b973a..2f0ccf2add 100644
--- a/arrow/src/util/bench_util.rs
+++ b/arrow/src/util/bench_util.rs
@@ -129,6 +129,104 @@ pub fn create_string_array<Offset: OffsetSizeTrait>(
create_string_array_with_max_len(size, null_density, 400)
}
+/// Creates an array of longer strings that share the same prefix. The prefix should be longer than 4 bytes,
+/// and each string should be longer than 12 bytes,
+/// so that the performance can be compared with StringViewArray, because StringViewArray stores 4 bytes inline in each view
+pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
+ size: usize,
+ null_density: f32,
+) -> GenericStringArray<Offset> {
+ create_string_array_with_len_range_and_prefix(size, null_density, 13, 100,
"prefix_")
+}
+
+/// Creates a string view array of longer strings that share the same prefix. The prefix should be longer than 4 bytes,
+/// and each string should be longer than 12 bytes,
+/// so that StringArray performance can be compared with StringViewArray, because StringViewArray stores 4 bytes inline in each view
+pub fn create_longer_string_view_array_with_same_prefix(
+ size: usize,
+ null_density: f32,
+) -> StringViewArray {
+ create_string_view_array_with_len_range_and_prefix(size, null_density, 13,
100, "prefix_")
+}
+
+fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
+ size: usize,
+ null_density: f32,
+ min_str_len: usize,
+ max_str_len: usize,
+ prefix: &str,
+) -> GenericStringArray<Offset> {
+ assert!(
+ min_str_len <= max_str_len,
+ "min_str_len must be <= max_str_len"
+ );
+ assert!(
+ prefix.len() <= max_str_len,
+ "Prefix length must be <= max_str_len"
+ );
+
+ let rng = &mut seedable_rng();
+ (0..size)
+ .map(|_| {
+ if rng.random::<f32>() < null_density {
+ None
+ } else {
+ let remaining_len = rng.random_range(
+ min_str_len.saturating_sub(prefix.len())..=(max_str_len -
prefix.len()),
+ );
+
+ let mut value = prefix.to_string();
+ value.extend(
+ rng.sample_iter(&Alphanumeric)
+ .take(remaining_len)
+ .map(char::from),
+ );
+
+ Some(value)
+ }
+ })
+ .collect()
+}
+
+fn create_string_view_array_with_len_range_and_prefix(
+ size: usize,
+ null_density: f32,
+ min_str_len: usize,
+ max_str_len: usize,
+ prefix: &str,
+) -> StringViewArray {
+ assert!(
+ min_str_len <= max_str_len,
+ "min_str_len must be <= max_str_len"
+ );
+ assert!(
+ prefix.len() <= max_str_len,
+ "Prefix length must be <= max_str_len"
+ );
+
+ let rng = &mut seedable_rng();
+ (0..size)
+ .map(|_| {
+ if rng.random::<f32>() < null_density {
+ None
+ } else {
+ let remaining_len = rng.random_range(
+ min_str_len.saturating_sub(prefix.len())..=(max_str_len -
prefix.len()),
+ );
+
+ let mut value = prefix.to_string();
+ value.extend(
+ rng.sample_iter(&Alphanumeric)
+ .take(remaining_len)
+ .map(char::from),
+ );
+
+ Some(value)
+ }
+ })
+ .collect()
+}
+
/// Creates a random (but fixed-seeded) array of rand size with a given max
size, null density and length
fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
size: usize,