This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 180776a  Separate parquet writer benchmarks (#818)
180776a is described below

commit 180776ac9bbf920b548f65b01f0323067bd1a529
Author: Wakahisa <[email protected]>
AuthorDate: Thu Oct 7 12:47:38 2021 +0200

    Separate parquet writer benchmarks (#818)
    
    * split benchmarks of primitive arrays
    
    * add list benches
---
 parquet/benches/arrow_writer.rs | 253 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 224 insertions(+), 29 deletions(-)

diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs
index 34ea2d2..f1154eb 100644
--- a/parquet/benches/arrow_writer.rs
+++ b/parquet/benches/arrow_writer.rs
@@ -36,25 +36,164 @@ fn create_primitive_bench_batch(
     true_density: f32,
 ) -> Result<RecordBatch> {
     let fields = vec![
-        Field::new("_1", DataType::Int8, true),
-        Field::new("_2", DataType::Int16, true),
-        Field::new("_3", DataType::Int32, true),
-        Field::new("_4", DataType::Int64, true),
-        Field::new("_5", DataType::UInt8, true),
-        Field::new("_6", DataType::UInt16, true),
-        Field::new("_7", DataType::UInt32, true),
-        Field::new("_8", DataType::UInt64, true),
-        Field::new("_9", DataType::Float32, true),
-        Field::new("_10", DataType::Float64, true),
-        Field::new("_11", DataType::Date32, true),
-        Field::new("_12", DataType::Date64, true),
-        Field::new("_13", DataType::Time32(TimeUnit::Second), true),
-        Field::new("_14", DataType::Time32(TimeUnit::Millisecond), true),
-        Field::new("_15", DataType::Time64(TimeUnit::Microsecond), true),
-        Field::new("_16", DataType::Time64(TimeUnit::Nanosecond), true),
-        Field::new("_17", DataType::Utf8, true),
-        Field::new("_18", DataType::LargeUtf8, true),
-        Field::new("_19", DataType::Boolean, true),
+        Field::new("_1", DataType::Int32, true),
+        Field::new("_2", DataType::Int64, true),
+        Field::new("_3", DataType::UInt32, true),
+        Field::new("_4", DataType::UInt64, true),
+        Field::new("_5", DataType::Float32, true),
+        Field::new("_6", DataType::Float64, true),
+        Field::new("_7", DataType::Date64, true),
+    ];
+    let schema = Schema::new(fields);
+    Ok(create_random_batch(
+        Arc::new(schema),
+        size,
+        null_density,
+        true_density,
+    )?)
+}
+
+fn create_primitive_bench_batch_non_null(
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<RecordBatch> {
+    let fields = vec![
+        Field::new("_1", DataType::Int32, false),
+        Field::new("_2", DataType::Int64, false),
+        Field::new("_3", DataType::UInt32, false),
+        Field::new("_4", DataType::UInt64, false),
+        Field::new("_5", DataType::Float32, false),
+        Field::new("_6", DataType::Float64, false),
+        Field::new("_7", DataType::Date64, false),
+    ];
+    let schema = Schema::new(fields);
+    Ok(create_random_batch(
+        Arc::new(schema),
+        size,
+        null_density,
+        true_density,
+    )?)
+}
+
+fn create_string_bench_batch(
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<RecordBatch> {
+    let fields = vec![
+        Field::new("_1", DataType::Utf8, true),
+        Field::new("_2", DataType::LargeUtf8, true),
+    ];
+    let schema = Schema::new(fields);
+    Ok(create_random_batch(
+        Arc::new(schema),
+        size,
+        null_density,
+        true_density,
+    )?)
+}
+
+fn create_string_bench_batch_non_null(
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<RecordBatch> {
+    let fields = vec![
+        Field::new("_1", DataType::Utf8, false),
+        Field::new("_2", DataType::LargeUtf8, false),
+    ];
+    let schema = Schema::new(fields);
+    Ok(create_random_batch(
+        Arc::new(schema),
+        size,
+        null_density,
+        true_density,
+    )?)
+}
+
+fn create_bool_bench_batch(
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<RecordBatch> {
+    let fields = vec![Field::new("_1", DataType::Boolean, true)];
+    let schema = Schema::new(fields);
+    Ok(create_random_batch(
+        Arc::new(schema),
+        size,
+        null_density,
+        true_density,
+    )?)
+}
+
+fn create_bool_bench_batch_non_null(
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<RecordBatch> {
+    let fields = vec![Field::new("_1", DataType::Boolean, false)];
+    let schema = Schema::new(fields);
+    Ok(create_random_batch(
+        Arc::new(schema),
+        size,
+        null_density,
+        true_density,
+    )?)
+}
+
+fn create_list_primitive_bench_batch(
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<RecordBatch> {
+    let fields = vec![
+        Field::new(
+            "_1",
+            DataType::List(Box::new(Field::new("item", DataType::Int32, true))),
+            true,
+        ),
+        Field::new(
+            "_2",
+            DataType::List(Box::new(Field::new("item", DataType::Boolean, true))),
+            true,
+        ),
+        Field::new(
+            "_3",
+            DataType::LargeList(Box::new(Field::new("item", DataType::Utf8, true))),
+            true,
+        ),
+    ];
+    let schema = Schema::new(fields);
+    Ok(create_random_batch(
+        Arc::new(schema),
+        size,
+        null_density,
+        true_density,
+    )?)
+}
+
+fn create_list_primitive_bench_batch_non_null(
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<RecordBatch> {
+    let fields = vec![
+        Field::new(
+            "_1",
+            DataType::List(Box::new(Field::new("item", DataType::Int32, false))),
+            false,
+        ),
+        Field::new(
+            "_2",
+            DataType::List(Box::new(Field::new("item", DataType::Boolean, false))),
+            false,
+        ),
+        Field::new(
+            "_3",
+            DataType::LargeList(Box::new(Field::new("item", DataType::Utf8, false))),
+            false,
+        ),
     ];
     let schema = Schema::new(fields);
     Ok(create_random_batch(
@@ -148,7 +287,7 @@ fn write_batch(batch: &RecordBatch) -> Result<()> {
 }
 
 fn bench_primitive_writer(c: &mut Criterion) {
-    let batch = create_primitive_bench_batch(1024, 0.25, 0.75).unwrap();
+    let batch = create_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
     let mut group = c.benchmark_group("write_batch primitive");
     group.throughput(Throughput::Bytes(
         batch
@@ -157,9 +296,47 @@ fn bench_primitive_writer(c: &mut Criterion) {
             .map(|f| f.get_array_memory_size() as u64)
             .sum(),
     ));
-    group.bench_function("1024 values", |b| b.iter(|| write_batch(&batch).unwrap()));
+    group.bench_function("4096 values primitive", |b| {
+        b.iter(|| write_batch(&batch).unwrap())
+    });
 
-    let batch = create_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
+    let batch = create_primitive_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
+    group.throughput(Throughput::Bytes(
+        batch
+            .columns()
+            .iter()
+            .map(|f| f.get_array_memory_size() as u64)
+            .sum(),
+    ));
+    group.bench_function("4096 values primitive non-null", |b| {
+        b.iter(|| write_batch(&batch).unwrap())
+    });
+
+    let batch = create_bool_bench_batch(4096, 0.25, 0.75).unwrap();
+    group.throughput(Throughput::Bytes(
+        batch
+            .columns()
+            .iter()
+            .map(|f| f.get_array_memory_size() as u64)
+            .sum(),
+    ));
+    group.bench_function("4096 values bool", |b| {
+        b.iter(|| write_batch(&batch).unwrap())
+    });
+
+    let batch = create_bool_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
+    group.throughput(Throughput::Bytes(
+        batch
+            .columns()
+            .iter()
+            .map(|f| f.get_array_memory_size() as u64)
+            .sum(),
+    ));
+    group.bench_function("4096 values bool non-null", |b| {
+        b.iter(|| write_batch(&batch).unwrap())
+    });
+
+    let batch = create_string_bench_batch(4096, 0.25, 0.75).unwrap();
     group.throughput(Throughput::Bytes(
         batch
             .columns()
@@ -167,14 +344,28 @@ fn bench_primitive_writer(c: &mut Criterion) {
             .map(|f| f.get_array_memory_size() as u64)
             .sum(),
     ));
-    group.bench_function("4096 values", |b| b.iter(|| write_batch(&batch).unwrap()));
+    group.bench_function("4096 values string", |b| {
+        b.iter(|| write_batch(&batch).unwrap())
+    });
+
+    let batch = create_string_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
+    group.throughput(Throughput::Bytes(
+        batch
+            .columns()
+            .iter()
+            .map(|f| f.get_array_memory_size() as u64)
+            .sum(),
+    ));
+    group.bench_function("4096 values string non-null", |b| {
+        b.iter(|| write_batch(&batch).unwrap())
+    });
 
     group.finish();
 }
 
 // This bench triggers a write error, it is ignored for now
-fn _bench_nested_writer(c: &mut Criterion) {
-    let batch = _create_nested_bench_batch(1024, 0.25, 0.75).unwrap();
+fn bench_nested_writer(c: &mut Criterion) {
+    let batch = create_list_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
     let mut group = c.benchmark_group("write_batch nested");
     group.throughput(Throughput::Bytes(
         batch
@@ -183,9 +374,11 @@ fn _bench_nested_writer(c: &mut Criterion) {
             .map(|f| f.get_array_memory_size() as u64)
             .sum(),
     ));
-    group.bench_function("1024 values", |b| b.iter(|| write_batch(&batch).unwrap()));
+    group.bench_function("4096 values primitive list", |b| {
+        b.iter(|| write_batch(&batch).unwrap())
+    });
 
-    let batch = create_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
+    let batch = create_list_primitive_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
     group.throughput(Throughput::Bytes(
         batch
             .columns()
@@ -193,10 +386,12 @@ fn _bench_nested_writer(c: &mut Criterion) {
             .map(|f| f.get_array_memory_size() as u64)
             .sum(),
     ));
-    group.bench_function("4096 values", |b| b.iter(|| write_batch(&batch).unwrap()));
+    group.bench_function("4096 values primitive list non-null", |b| {
+        b.iter(|| write_batch(&batch).unwrap())
+    });
 
     group.finish();
 }
 
-criterion_group!(benches, bench_primitive_writer);
+criterion_group!(benches, bench_primitive_writer, bench_nested_writer);
 criterion_main!(benches);

Reply via email to