This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 180776a Separate parquet writer benchmarks (#818)
180776a is described below
commit 180776ac9bbf920b548f65b01f0323067bd1a529
Author: Wakahisa <[email protected]>
AuthorDate: Thu Oct 7 12:47:38 2021 +0200
Separate parquet writer benchmarks (#818)
* split benchmarks of primitive arrays
* add list benches
---
parquet/benches/arrow_writer.rs | 253 +++++++++++++++++++++++++++++++++++-----
1 file changed, 224 insertions(+), 29 deletions(-)
diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs
index 34ea2d2..f1154eb 100644
--- a/parquet/benches/arrow_writer.rs
+++ b/parquet/benches/arrow_writer.rs
@@ -36,25 +36,164 @@ fn create_primitive_bench_batch(
true_density: f32,
) -> Result<RecordBatch> {
let fields = vec![
- Field::new("_1", DataType::Int8, true),
- Field::new("_2", DataType::Int16, true),
- Field::new("_3", DataType::Int32, true),
- Field::new("_4", DataType::Int64, true),
- Field::new("_5", DataType::UInt8, true),
- Field::new("_6", DataType::UInt16, true),
- Field::new("_7", DataType::UInt32, true),
- Field::new("_8", DataType::UInt64, true),
- Field::new("_9", DataType::Float32, true),
- Field::new("_10", DataType::Float64, true),
- Field::new("_11", DataType::Date32, true),
- Field::new("_12", DataType::Date64, true),
- Field::new("_13", DataType::Time32(TimeUnit::Second), true),
- Field::new("_14", DataType::Time32(TimeUnit::Millisecond), true),
- Field::new("_15", DataType::Time64(TimeUnit::Microsecond), true),
- Field::new("_16", DataType::Time64(TimeUnit::Nanosecond), true),
- Field::new("_17", DataType::Utf8, true),
- Field::new("_18", DataType::LargeUtf8, true),
- Field::new("_19", DataType::Boolean, true),
+ Field::new("_1", DataType::Int32, true),
+ Field::new("_2", DataType::Int64, true),
+ Field::new("_3", DataType::UInt32, true),
+ Field::new("_4", DataType::UInt64, true),
+ Field::new("_5", DataType::Float32, true),
+ Field::new("_6", DataType::Float64, true),
+ Field::new("_7", DataType::Date64, true),
+ ];
+ let schema = Schema::new(fields);
+ Ok(create_random_batch(
+ Arc::new(schema),
+ size,
+ null_density,
+ true_density,
+ )?)
+}
+
+fn create_primitive_bench_batch_non_null(
+ size: usize,
+ null_density: f32,
+ true_density: f32,
+) -> Result<RecordBatch> {
+ let fields = vec![
+ Field::new("_1", DataType::Int32, false),
+ Field::new("_2", DataType::Int64, false),
+ Field::new("_3", DataType::UInt32, false),
+ Field::new("_4", DataType::UInt64, false),
+ Field::new("_5", DataType::Float32, false),
+ Field::new("_6", DataType::Float64, false),
+ Field::new("_7", DataType::Date64, false),
+ ];
+ let schema = Schema::new(fields);
+ Ok(create_random_batch(
+ Arc::new(schema),
+ size,
+ null_density,
+ true_density,
+ )?)
+}
+
+fn create_string_bench_batch(
+ size: usize,
+ null_density: f32,
+ true_density: f32,
+) -> Result<RecordBatch> {
+ let fields = vec![
+ Field::new("_1", DataType::Utf8, true),
+ Field::new("_2", DataType::LargeUtf8, true),
+ ];
+ let schema = Schema::new(fields);
+ Ok(create_random_batch(
+ Arc::new(schema),
+ size,
+ null_density,
+ true_density,
+ )?)
+}
+
+fn create_string_bench_batch_non_null(
+ size: usize,
+ null_density: f32,
+ true_density: f32,
+) -> Result<RecordBatch> {
+ let fields = vec![
+ Field::new("_1", DataType::Utf8, false),
+ Field::new("_2", DataType::LargeUtf8, false),
+ ];
+ let schema = Schema::new(fields);
+ Ok(create_random_batch(
+ Arc::new(schema),
+ size,
+ null_density,
+ true_density,
+ )?)
+}
+
+fn create_bool_bench_batch(
+ size: usize,
+ null_density: f32,
+ true_density: f32,
+) -> Result<RecordBatch> {
+ let fields = vec![Field::new("_1", DataType::Boolean, true)];
+ let schema = Schema::new(fields);
+ Ok(create_random_batch(
+ Arc::new(schema),
+ size,
+ null_density,
+ true_density,
+ )?)
+}
+
+fn create_bool_bench_batch_non_null(
+ size: usize,
+ null_density: f32,
+ true_density: f32,
+) -> Result<RecordBatch> {
+ let fields = vec![Field::new("_1", DataType::Boolean, false)];
+ let schema = Schema::new(fields);
+ Ok(create_random_batch(
+ Arc::new(schema),
+ size,
+ null_density,
+ true_density,
+ )?)
+}
+
+fn create_list_primitive_bench_batch(
+ size: usize,
+ null_density: f32,
+ true_density: f32,
+) -> Result<RecordBatch> {
+ let fields = vec![
+ Field::new(
+ "_1",
+ DataType::List(Box::new(Field::new("item", DataType::Int32, true))),
+ true,
+ ),
+ Field::new(
+ "_2",
+ DataType::List(Box::new(Field::new("item", DataType::Boolean, true))),
+ true,
+ ),
+ Field::new(
+ "_3",
+ DataType::LargeList(Box::new(Field::new("item", DataType::Utf8, true))),
+ true,
+ ),
+ ];
+ let schema = Schema::new(fields);
+ Ok(create_random_batch(
+ Arc::new(schema),
+ size,
+ null_density,
+ true_density,
+ )?)
+}
+
+fn create_list_primitive_bench_batch_non_null(
+ size: usize,
+ null_density: f32,
+ true_density: f32,
+) -> Result<RecordBatch> {
+ let fields = vec![
+ Field::new(
+ "_1",
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false))),
+ false,
+ ),
+ Field::new(
+ "_2",
+ DataType::List(Box::new(Field::new("item", DataType::Boolean, false))),
+ false,
+ ),
+ Field::new(
+ "_3",
+ DataType::LargeList(Box::new(Field::new("item", DataType::Utf8, false))),
+ false,
+ ),
];
let schema = Schema::new(fields);
Ok(create_random_batch(
@@ -148,7 +287,7 @@ fn write_batch(batch: &RecordBatch) -> Result<()> {
}
fn bench_primitive_writer(c: &mut Criterion) {
- let batch = create_primitive_bench_batch(1024, 0.25, 0.75).unwrap();
+ let batch = create_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
let mut group = c.benchmark_group("write_batch primitive");
group.throughput(Throughput::Bytes(
batch
@@ -157,9 +296,47 @@ fn bench_primitive_writer(c: &mut Criterion) {
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
- group.bench_function("1024 values", |b| b.iter(|| write_batch(&batch).unwrap()));
+ group.bench_function("4096 values primitive", |b| {
+ b.iter(|| write_batch(&batch).unwrap())
+ });
- let batch = create_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
+ let batch = create_primitive_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
+ group.throughput(Throughput::Bytes(
+ batch
+ .columns()
+ .iter()
+ .map(|f| f.get_array_memory_size() as u64)
+ .sum(),
+ ));
+ group.bench_function("4096 values primitive non-null", |b| {
+ b.iter(|| write_batch(&batch).unwrap())
+ });
+
+ let batch = create_bool_bench_batch(4096, 0.25, 0.75).unwrap();
+ group.throughput(Throughput::Bytes(
+ batch
+ .columns()
+ .iter()
+ .map(|f| f.get_array_memory_size() as u64)
+ .sum(),
+ ));
+ group.bench_function("4096 values bool", |b| {
+ b.iter(|| write_batch(&batch).unwrap())
+ });
+
+ let batch = create_bool_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
+ group.throughput(Throughput::Bytes(
+ batch
+ .columns()
+ .iter()
+ .map(|f| f.get_array_memory_size() as u64)
+ .sum(),
+ ));
+ group.bench_function("4096 values bool non-null", |b| {
+ b.iter(|| write_batch(&batch).unwrap())
+ });
+
+ let batch = create_string_bench_batch(4096, 0.25, 0.75).unwrap();
group.throughput(Throughput::Bytes(
batch
.columns()
@@ -167,14 +344,28 @@ fn bench_primitive_writer(c: &mut Criterion) {
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
- group.bench_function("4096 values", |b| b.iter(|| write_batch(&batch).unwrap()));
+ group.bench_function("4096 values string", |b| {
+ b.iter(|| write_batch(&batch).unwrap())
+ });
+
+ let batch = create_string_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
+ group.throughput(Throughput::Bytes(
+ batch
+ .columns()
+ .iter()
+ .map(|f| f.get_array_memory_size() as u64)
+ .sum(),
+ ));
+ group.bench_function("4096 values string non-null", |b| {
+ b.iter(|| write_batch(&batch).unwrap())
+ });
group.finish();
}
// This bench triggers a write error, it is ignored for now
-fn _bench_nested_writer(c: &mut Criterion) {
- let batch = _create_nested_bench_batch(1024, 0.25, 0.75).unwrap();
+fn bench_nested_writer(c: &mut Criterion) {
+ let batch = create_list_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
let mut group = c.benchmark_group("write_batch nested");
group.throughput(Throughput::Bytes(
batch
@@ -183,9 +374,11 @@ fn _bench_nested_writer(c: &mut Criterion) {
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
- group.bench_function("1024 values", |b| b.iter(|| write_batch(&batch).unwrap()));
+ group.bench_function("4096 values primitive list", |b| {
+ b.iter(|| write_batch(&batch).unwrap())
+ });
- let batch = create_primitive_bench_batch(4096, 0.25, 0.75).unwrap();
+ let batch = create_list_primitive_bench_batch_non_null(4096, 0.25, 0.75).unwrap();
group.throughput(Throughput::Bytes(
batch
.columns()
@@ -193,10 +386,12 @@ fn _bench_nested_writer(c: &mut Criterion) {
.map(|f| f.get_array_memory_size() as u64)
.sum(),
));
- group.bench_function("4096 values", |b| b.iter(|| write_batch(&batch).unwrap()));
+ group.bench_function("4096 values primitive list non-null", |b| {
+ b.iter(|| write_batch(&batch).unwrap())
+ });
group.finish();
}
-criterion_group!(benches, bench_primitive_writer);
+criterion_group!(benches, bench_primitive_writer, bench_nested_writer);
criterion_main!(benches);