This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 99b2b3b92e Add additional benchmarks for utf8view comparison kernels 
(#7351)
99b2b3b92e is described below

commit 99b2b3b92ed31927d7542046db47baccbef67888
Author: Qi Zhu <[email protected]>
AuthorDate: Sun Mar 30 20:18:05 2025 +0800

    Add additional benchmarks for utf8view comparison kernels (#7351)
    
    * Add reproducer cases in which Utf8View is slower than Utf8
    
    * fix fmt
    
    * Add the original format for new benchmark
    
    * fmt
    
    * Fix clippy
---
 arrow/benches/comparison_kernels.rs | 97 ++++++++++++++++++++++++++++++++++--
 arrow/src/util/bench_util.rs        | 98 +++++++++++++++++++++++++++++++++++++
 2 files changed, 191 insertions(+), 4 deletions(-)

diff --git a/arrow/benches/comparison_kernels.rs 
b/arrow/benches/comparison_kernels.rs
index 84fd47acc1..c12fd2ad35 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -32,14 +32,14 @@ use rand::Rng;
 
 const SIZE: usize = 65536;
 
-fn bench_like_utf8view_scalar(arr_a: &StringViewArray, value_b: &str) {
-    like(arr_a, &StringViewArray::new_scalar(value_b)).unwrap();
-}
-
 fn bench_like_utf8_scalar(arr_a: &StringArray, value_b: &str) {
     like(arr_a, &StringArray::new_scalar(value_b)).unwrap();
 }
 
+fn bench_like_utf8view_scalar(arr_a: &StringViewArray, value_b: &str) {
+    like(arr_a, &StringViewArray::new_scalar(value_b)).unwrap();
+}
+
 fn bench_nlike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
     nlike(arr_a, &StringArray::new_scalar(value_b)).unwrap();
 }
@@ -88,6 +88,16 @@ fn add_benchmark(c: &mut Criterion) {
     let arr_string = create_string_array::<i32>(SIZE, 0.0);
     let arr_string_view = create_string_view_array(SIZE, 0.0);
 
+    // create long string arrays with the same prefix
+    let arr_long_string = 
create_longer_string_array_with_same_prefix::<i32>(SIZE, 0.0);
+    let arr_long_string_view = 
create_longer_string_view_array_with_same_prefix(SIZE, 0.0);
+
+    let left_arr_long_string = 
create_longer_string_array_with_same_prefix::<i32>(SIZE, 0.0);
+    let right_arr_long_string = 
create_longer_string_array_with_same_prefix::<i32>(SIZE, 0.0);
+
+    let left_arr_long_string_view = 
create_longer_string_view_array_with_same_prefix(SIZE, 0.0);
+    let right_arr_long_string_view = 
create_longer_string_view_array_with_same_prefix(SIZE, 0.0);
+
     let scalar = Float32Array::from(vec![1.0]);
 
     // eq benchmarks
@@ -225,6 +235,31 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| eq(&string_view_left, &string_view_right).unwrap())
     });
 
+    // eq benchmarks for long strings with the same prefix
+    c.bench_function("eq long same prefix strings StringArray", |b| {
+        b.iter(|| eq(&left_arr_long_string, &right_arr_long_string).unwrap())
+    });
+
+    c.bench_function("neq long same prefix strings StringArray", |b| {
+        b.iter(|| neq(&left_arr_long_string, &right_arr_long_string).unwrap())
+    });
+
+    c.bench_function("lt long same prefix strings StringArray", |b| {
+        b.iter(|| lt(&left_arr_long_string, &right_arr_long_string).unwrap())
+    });
+
+    c.bench_function("eq long same prefix strings StringViewArray", |b| {
+        b.iter(|| eq(&left_arr_long_string_view, 
&right_arr_long_string_view).unwrap())
+    });
+
+    c.bench_function("neq long same prefix strings StringViewArray", |b| {
+        b.iter(|| neq(&left_arr_long_string_view, 
&right_arr_long_string_view).unwrap())
+    });
+
+    c.bench_function("lt long same prefix strings StringViewArray", |b| {
+        b.iter(|| lt(&left_arr_long_string_view, 
&right_arr_long_string_view).unwrap())
+    });
+
     // StringArray: LIKE benchmarks
 
     c.bench_function("like_utf8 scalar equals", |b| {
@@ -247,6 +282,60 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bench_like_utf8_scalar(&arr_string, "%xx_xx%xxx"))
     });
 
+    // StringArray: LIKE benchmarks with long strings sharing the same prefix
+    // Note:
+    // every string starts with the same prefix "prefix_" (over 4 bytes),
+    // followed by a random tail, so the total length is greater than 12 bytes.
+    c.bench_function("long same prefix strings like_utf8 scalar equals", |b| {
+        b.iter(|| bench_like_utf8_scalar(&arr_long_string, "prefix_1234"))
+    });
+
+    c.bench_function("long same prefix strings like_utf8 scalar contains", |b| 
{
+        b.iter(|| bench_like_utf8_scalar(&arr_long_string, "%prefix_1234%"))
+    });
+
+    c.bench_function("long same prefix strings like_utf8 scalar ends with", 
|b| {
+        b.iter(|| bench_like_utf8_scalar(&arr_long_string, "%prefix_1234"))
+    });
+
+    c.bench_function(
+        "long same prefix strings like_utf8 scalar starts with",
+        |b| b.iter(|| bench_like_utf8_scalar(&arr_long_string, 
"prefix_1234%")),
+    );
+
+    c.bench_function("long same prefix strings like_utf8 scalar complex", |b| {
+        b.iter(|| bench_like_utf8_scalar(&arr_long_string, "%prefix_1234%xxx"))
+    });
+
+    // StringViewArray: LIKE benchmarks with long strings sharing the same prefix
+    // Note:
+    // every string starts with the same prefix "prefix_" (over 4 bytes),
+    // followed by a random tail, so the total length is greater than 12 bytes.
+    c.bench_function(
+        "long same prefix strings like_utf8view scalar equals",
+        |b| b.iter(|| bench_like_utf8view_scalar(&arr_long_string_view, 
"prefix_1234")),
+    );
+
+    c.bench_function(
+        "long same prefix strings like_utf8view scalar contains",
+        |b| b.iter(|| bench_like_utf8view_scalar(&arr_long_string_view, 
"%prefix_1234%")),
+    );
+
+    c.bench_function(
+        "long same prefix strings like_utf8view scalar ends with",
+        |b| b.iter(|| bench_like_utf8view_scalar(&arr_long_string_view, 
"%prefix_1234")),
+    );
+
+    c.bench_function(
+        "long same prefix strings like_utf8view scalar starts with",
+        |b| b.iter(|| bench_like_utf8view_scalar(&arr_long_string_view, 
"prefix_1234%")),
+    );
+
+    c.bench_function(
+        "long same prefix strings like_utf8view scalar complex",
+        |b| b.iter(|| bench_like_utf8view_scalar(&arr_long_string_view, 
"%prefix_1234%xxx")),
+    );
+
     // StringViewArray: LIKE benchmarks
     // Note: since like/nlike share the same implementation, we only benchmark 
one
     c.bench_function("like_utf8view scalar equals", |b| {
diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs
index 387d9b973a..2f0ccf2add 100644
--- a/arrow/src/util/bench_util.rs
+++ b/arrow/src/util/bench_util.rs
@@ -129,6 +129,104 @@ pub fn create_string_array<Offset: OffsetSizeTrait>(
     create_string_array_with_max_len(size, null_density, 400)
 }
 
+/// Creates a string array of longer strings sharing the same prefix. The 
prefix is longer than 4 bytes
+/// and every string is longer than 12 bytes,
+/// so that the performance can be compared with StringViewArray, because 
a StringViewArray view keeps a 4-byte prefix inline
+pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
+    size: usize,
+    null_density: f32,
+) -> GenericStringArray<Offset> {
+    create_string_array_with_len_range_and_prefix(size, null_density, 13, 100, 
"prefix_")
+}
+
+/// Creates a string view array of longer strings sharing the same prefix. 
The prefix is longer than 4 bytes
+/// and every string is longer than 12 bytes,
+/// so that StringArray performance can be compared with StringViewArray, 
because a StringViewArray view keeps a 4-byte prefix inline
+pub fn create_longer_string_view_array_with_same_prefix(
+    size: usize,
+    null_density: f32,
+) -> StringViewArray {
+    create_string_view_array_with_len_range_and_prefix(size, null_density, 13, 
100, "prefix_")
+}
+
+fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
+    size: usize,
+    null_density: f32,
+    min_str_len: usize,
+    max_str_len: usize,
+    prefix: &str,
+) -> GenericStringArray<Offset> {
+    assert!(
+        min_str_len <= max_str_len,
+        "min_str_len must be <= max_str_len"
+    );
+    assert!(
+        prefix.len() <= max_str_len,
+        "Prefix length must be <= max_str_len"
+    );
+
+    let rng = &mut seedable_rng();
+    (0..size)
+        .map(|_| {
+            if rng.random::<f32>() < null_density {
+                None
+            } else {
+                let remaining_len = rng.random_range(
+                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - 
prefix.len()),
+                );
+
+                let mut value = prefix.to_string();
+                value.extend(
+                    rng.sample_iter(&Alphanumeric)
+                        .take(remaining_len)
+                        .map(char::from),
+                );
+
+                Some(value)
+            }
+        })
+        .collect()
+}
+
+fn create_string_view_array_with_len_range_and_prefix(
+    size: usize,
+    null_density: f32,
+    min_str_len: usize,
+    max_str_len: usize,
+    prefix: &str,
+) -> StringViewArray {
+    assert!(
+        min_str_len <= max_str_len,
+        "min_str_len must be <= max_str_len"
+    );
+    assert!(
+        prefix.len() <= max_str_len,
+        "Prefix length must be <= max_str_len"
+    );
+
+    let rng = &mut seedable_rng();
+    (0..size)
+        .map(|_| {
+            if rng.random::<f32>() < null_density {
+                None
+            } else {
+                let remaining_len = rng.random_range(
+                    min_str_len.saturating_sub(prefix.len())..=(max_str_len - 
prefix.len()),
+                );
+
+                let mut value = prefix.to_string();
+                value.extend(
+                    rng.sample_iter(&Alphanumeric)
+                        .take(remaining_len)
+                        .map(char::from),
+                );
+
+                Some(value)
+            }
+        })
+        .collect()
+}
+
 /// Creates a random (but fixed-seeded) array of rand size with a given max 
size, null density and length
 fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
     size: usize,

Reply via email to