This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new b8b76bc225 Add benchmark for SUBSTR to evaluate improvements using StringView (#12111)
b8b76bc225 is described below

commit b8b76bc225a9b0c51407261cc7b55770db1a958b
Author: kf zheng <[email protected]>
AuthorDate: Fri Aug 23 02:10:55 2024 +0800

    Add benchmark for SUBSTR to evaluate improvements using StringView (#12111)
    
    * add a bench file substr.rs
    
    * taplo format
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/functions/Cargo.toml        |   5 +
 datafusion/functions/benches/substr.rs | 202 +++++++++++++++++++++++++++++++++
 2 files changed, 207 insertions(+)

diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 9ef020b772..337379a746 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -161,3 +161,8 @@ required-features = ["string_expressions"]
 harness = false
 name = "random"
 required-features = ["math_expressions"]
+
+[[bench]]
+harness = false
+name = "substr"
+required-features = ["unicode_expressions"]
diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs
new file mode 100644
index 0000000000..14a3389da3
--- /dev/null
+++ b/datafusion/functions/benches/substr.rs
@@ -0,0 +1,202 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait};
+use arrow::util::bench_util::{
+    create_string_array_with_len, create_string_view_array_with_len,
+};
+use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode};
+use datafusion_expr::ColumnarValue;
+use datafusion_functions::unicode;
+use std::sync::Arc;
+
+/// Builds the arguments for `substr(string, start)`.
+///
+/// Every array holds `size` rows. The string column is a string-view array
+/// when `use_string_view` is set, otherwise a string array with offset type
+/// `O`; both generators receive `str_len` as the value length. The start
+/// column is the constant `str_len / 2` when `start_half_way` is set,
+/// otherwise the constant `1`.
+///
+/// NOTE(review): `0.1` and `false` are presumably arrow's null density and
+/// "mixed length" flag -- confirm against `arrow::util::bench_util`.
+fn create_args_without_count<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+    start_half_way: bool,
+    use_string_view: bool,
+) -> Vec<ColumnarValue> {
+    // Every row uses the same start, so pick it once rather than per row.
+    let start = if start_half_way {
+        (str_len / 2) as i64
+    } else {
+        1
+    };
+    let start_array: ArrayRef = Arc::new(Int64Array::from(vec![start; size]));
+
+    let string_array: ArrayRef = if use_string_view {
+        Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false))
+    } else {
+        Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len))
+    };
+
+    vec![
+        ColumnarValue::Array(string_array),
+        ColumnarValue::Array(start_array),
+    ]
+}
+
+/// Builds the arguments for `substr(string, start, count)`.
+///
+/// Every array holds `size` rows. The string column is a string-view array
+/// when `use_string_view` is set, otherwise a string array with offset type
+/// `O`; both generators receive `str_len` as the value length. The start
+/// column is the constant `1` and the count column is the constant
+/// `min(count_max, str_len)`, so the count never exceeds `str_len`.
+///
+/// NOTE(review): `0.1` and `false` are presumably arrow's null density and
+/// "mixed length" flag -- confirm against `arrow::util::bench_util`.
+fn create_args_with_count<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+    count_max: usize,
+    use_string_view: bool,
+) -> Vec<ColumnarValue> {
+    let count = count_max.min(str_len) as i64;
+    let start_array: ArrayRef = Arc::new(Int64Array::from(vec![1i64; size]));
+    let count_array: ArrayRef = Arc::new(Int64Array::from(vec![count; size]));
+
+    let string_array: ArrayRef = if use_string_view {
+        Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false))
+    } else {
+        Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len))
+    };
+
+    vec![
+        ColumnarValue::Array(string_array),
+        ColumnarValue::Array(start_array),
+        ColumnarValue::Array(count_array),
+    ]
+}
+
+/// Benchmarks `unicode::substr` on string-view, `i32`-offset and `i64`-offset
+/// string inputs, for batches of 1024 and 4096 rows.
+///
+/// Each batch size runs three criterion groups, all with flat sampling and a
+/// sample size of 10:
+/// * `SHORTER THAN 12`: `str_len = 12`, `substr(string, start)` with
+///   `start = str_len / 2`
+/// * `LONGER THAN 12`: `str_len = 128`, `substr(string, 1, 64)`
+/// * `SRC_LEN > 12, SUB_LEN < 12`: `str_len = 128`, `substr(string, 1, 6)`
+///
+/// NOTE(review): the threshold 12 presumably refers to the inline limit of
+/// Arrow string views -- confirm.
+fn criterion_benchmark(c: &mut Criterion) {
+    let substr = unicode::substr();
+    for size in [1024, 4096] {
+        // string_len = 12, start = len / 2 = 6, no count argument
+        // (see `create_args_without_count`)
+        let len = 12;
+        let mut group = c.benchmark_group("SHORTER THAN 12");
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+
+        let args = create_args_without_count::<i32>(size, len, true, true);
+        group.bench_function(
+            format!("substr_string_view [size={}, strlen={}]", size, len),
+            |b| b.iter(|| black_box(substr.invoke(&args))),
+        );
+
+        // Start half way here as well, so that all three variants extract
+        // the same substring and their timings are comparable.
+        let args = create_args_without_count::<i32>(size, len, true, false);
+        group.bench_function(
+            format!("substr_string [size={}, strlen={}]", size, len),
+            |b| b.iter(|| black_box(substr.invoke(&args))),
+        );
+
+        let args = create_args_without_count::<i64>(size, len, true, false);
+        group.bench_function(
+            format!("substr_large_string [size={}, strlen={}]", size, len),
+            |b| b.iter(|| black_box(substr.invoke(&args))),
+        );
+
+        group.finish();
+
+        // string_len = 128, start=1, count=64, substring_len=64
+        let len = 128;
+        let count = 64;
+        let mut group = c.benchmark_group("LONGER THAN 12");
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+
+        let args = create_args_with_count::<i32>(size, len, count, true);
+        group.bench_function(
+            format!(
+                "substr_string_view [size={}, count={}, strlen={}]",
+                size, count, len,
+            ),
+            |b| b.iter(|| black_box(substr.invoke(&args))),
+        );
+
+        let args = create_args_with_count::<i32>(size, len, count, false);
+        group.bench_function(
+            format!(
+                "substr_string [size={}, count={}, strlen={}]",
+                size, count, len,
+            ),
+            |b| b.iter(|| black_box(substr.invoke(&args))),
+        );
+
+        let args = create_args_with_count::<i64>(size, len, count, false);
+        group.bench_function(
+            format!(
+                "substr_large_string [size={}, count={}, strlen={}]",
+                size, count, len,
+            ),
+            |b| b.iter(|| black_box(substr.invoke(&args))),
+        );
+
+        group.finish();
+
+        // string_len = 128, start=1, count=6, substring_len=6
+        let len = 128;
+        let count = 6;
+        let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12");
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+
+        let args = create_args_with_count::<i32>(size, len, count, true);
+        group.bench_function(
+            format!(
+                "substr_string_view [size={}, count={}, strlen={}]",
+                size, count, len,
+            ),
+            |b| b.iter(|| black_box(substr.invoke(&args))),
+        );
+
+        let args = create_args_with_count::<i32>(size, len, count, false);
+        group.bench_function(
+            format!(
+                "substr_string [size={}, count={}, strlen={}]",
+                size, count, len,
+            ),
+            |b| b.iter(|| black_box(substr.invoke(&args))),
+        );
+
+        let args = create_args_with_count::<i64>(size, len, count, false);
+        group.bench_function(
+            format!(
+                "substr_large_string [size={}, count={}, strlen={}]",
+                size, count, len,
+            ),
+            |b| b.iter(|| black_box(substr.invoke(&args))),
+        );
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to