This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 658e58f88 add benchmark to track performance (#6101)
658e58f88 is described below

commit 658e58f8822926cdc53cd0af89a84d53257feb22
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jul 22 16:28:51 2024 -0400

    add benchmark to track performance (#6101)
---
 parquet/benches/arrow_reader.rs | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs
index 927998ac2..814e75c24 100644
--- a/parquet/benches/arrow_reader.rs
+++ b/parquet/benches/arrow_reader.rs
@@ -263,9 +263,10 @@ where
     InMemoryPageIterator::new(pages)
 }
 
-fn build_plain_encoded_byte_array_page_iterator(
+fn build_plain_encoded_byte_array_page_iterator_inner(
     column_desc: ColumnDescPtr,
     null_density: f32,
+    short_string: bool,
 ) -> impl PageIterator + Clone {
     let max_def_level = column_desc.max_def_level();
     let max_rep_level = column_desc.max_rep_level();
@@ -285,7 +286,11 @@ fn build_plain_encoded_byte_array_page_iterator(
                     max_def_level
                 };
                 if def_level == max_def_level {
-                    let string_value = format!("Test value {k}, row group: {i}, page: {j}");
+                    let string_value = if short_string {
+                        format!("{k}{i}{j}")
+                    } else {
+                        format!("Test value {k}, row group: {i}, page: {j}")
+                        };
                     values.push(parquet::data_type::ByteArray::from(string_value.as_str()));
                 }
                 def_levels.push(def_level);
@@ -303,6 +308,13 @@ fn build_plain_encoded_byte_array_page_iterator(
     InMemoryPageIterator::new(pages)
 }
 
+fn build_plain_encoded_byte_array_page_iterator(
+    column_desc: ColumnDescPtr,
+    null_density: f32,
+) -> impl PageIterator + Clone {
+    build_plain_encoded_byte_array_page_iterator_inner(column_desc, null_density, false)
+}
+
 fn build_dictionary_encoded_string_page_iterator(
     column_desc: ColumnDescPtr,
     null_density: f32,
@@ -1066,6 +1078,27 @@ fn add_benches(c: &mut Criterion) {
 
     let mut group = c.benchmark_group("arrow_array_reader/BinaryViewArray");
 
+    // binary view, plain encoded, no NULLs, short string
+    let plain_byte_array_no_null_data = build_plain_encoded_byte_array_page_iterator_inner(
+        mandatory_binary_column_desc.clone(),
+        0.0,
+        true,
+    );
+
+    // Short strings should not be slower than long strings, however, as discussed in https://github.com/apache/arrow-rs/issues/6034,
+    // the current implementation is more than 2x slower.
+    // This benchmark tracks the performance of short strings so that we can optimize it.
+    group.bench_function("plain encoded, mandatory, no NULLs, short string", |b| {
+        b.iter(|| {
+            let array_reader = create_byte_view_array_reader(
+                plain_byte_array_no_null_data.clone(),
+                mandatory_binary_column_desc.clone(),
+            );
+            count = bench_array_reader(array_reader);
+        });
+        assert_eq!(count, EXPECTED_VALUE_COUNT);
+    });
+
     // binary view, plain encoded, no NULLs
     let plain_byte_array_no_null_data =
         build_plain_encoded_byte_array_page_iterator(mandatory_binary_column_desc.clone(), 0.0);

Reply via email to