This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 0804ba6072 GH-45045: [C++][Parquet] Add a benchmark for size_statistics_level (#45085)
0804ba6072 is described below
commit 0804ba60725dc4e4e41ad114a87ac46ae028bd37
Author: Gang Wu <[email protected]>
AuthorDate: Wed Jan 8 17:47:20 2025 +0800
GH-45045: [C++][Parquet] Add a benchmark for size_statistics_level (#45085)
### Rationale for this change
Add a benchmark to measure the performance of writing different size statistics levels.
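For context, the statistics level is selected through the Parquet writer properties exercised by this benchmark. Below is a minimal, hypothetical sketch (not part of this change) of how a writer would opt in; the tiny in-memory table and the `main()` wrapper are illustrative assumptions, while the property calls mirror the benchmark code:
```cpp
// Illustrative sketch only: write a small table with page- and
// column-chunk-level size statistics enabled via WriterProperties.
#include <memory>

#include "arrow/api.h"
#include "parquet/arrow/writer.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
#include "parquet/properties.h"

int main() {
  // Build a one-column Int64 table (contents are arbitrary).
  arrow::Int64Builder builder;
  PARQUET_THROW_NOT_OK(builder.AppendValues({1, 2, 3, 4}));
  std::shared_ptr<arrow::Array> array;
  PARQUET_THROW_NOT_OK(builder.Finish(&array));
  auto table = arrow::Table::Make(
      arrow::schema({arrow::field("column", arrow::int64())}), {array});

  // Enable both page-level and column-chunk-level size statistics; the
  // benchmark compares this level against ColumnChunk and None.
  auto properties =
      parquet::WriterProperties::Builder()
          .enable_statistics()
          ->enable_write_page_index()
          ->set_size_statistics_level(
              parquet::SizeStatisticsLevel::PageAndColumnChunk)
          ->build();

  // Write to an in-memory buffer, as the benchmark does.
  auto output = parquet::CreateOutputStream();
  PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(
      *table, arrow::default_memory_pool(), output,
      parquet::DEFAULT_MAX_ROW_GROUP_LENGTH, properties));
  return 0;
}
```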
### What changes are included in this PR?
Add a size_stats_benchmark for Parquet.
### Are these changes tested?
No
### Are there any user-facing changes?
No
* GitHub Issue: #45045
Lead-authored-by: Gang Wu <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/CMakeLists.txt | 1 +
cpp/src/parquet/arrow/size_stats_benchmark.cc | 165 ++++++++++++++++++++++++++
2 files changed, 166 insertions(+)
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index 0a9f92cebb..83eb522484 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -437,3 +437,4 @@ add_parquet_benchmark(metadata_benchmark)
add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc
benchmark_util.cc)
add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow")
+add_parquet_benchmark(arrow/size_stats_benchmark PREFIX "parquet-arrow")
diff --git a/cpp/src/parquet/arrow/size_stats_benchmark.cc b/cpp/src/parquet/arrow/size_stats_benchmark.cc
new file mode 100644
index 0000000000..d43a3737b1
--- /dev/null
+++ b/cpp/src/parquet/arrow/size_stats_benchmark.cc
@@ -0,0 +1,165 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <cstdint>
+#include <numeric>
+
+#include "parquet/arrow/writer.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+#include "arrow/array.h"
+#include "arrow/io/buffered.h"
+#include "arrow/io/memory.h"
+#include "arrow/table.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
+
+namespace parquet::benchmark {
+
+// This should result in multiple pages for most primitive types
+constexpr int64_t kBenchmarkSize = 1024 * 1024;
+// Use a skewed null probability to reduce levels encoding overhead
+constexpr double kNullProbability = 0.95;
+
+int64_t GetTotalBytes(const std::shared_ptr<::arrow::ArrayData>& data) {
+ if (data == nullptr) {
+ return 0;
+ }
+ int64_t total_bytes =
+ std::accumulate(data->buffers.cbegin(), data->buffers.cend(), int64_t{0},
+ [](int64_t acc, const auto& buffer) {
+ return acc + (buffer != nullptr ? buffer->size() : int64_t{0});
+ });
+ total_bytes += std::accumulate(
+ data->child_data.cbegin(), data->child_data.cend(), int64_t{0},
+ [](int64_t acc, const auto& child) { return acc + GetTotalBytes(child); });
+ total_bytes += GetTotalBytes(data->dictionary);
+ return total_bytes;
+}
+
+int64_t GetTotalBytes(const std::shared_ptr<::arrow::Table>& table) {
+ int64_t total_bytes = 0;
+ for (const auto& column : table->columns()) {
+ for (const auto& chunk : column->chunks()) {
+ total_bytes += GetTotalBytes(chunk->data());
+ }
+ }
+ return total_bytes;
+}
+
+int64_t GetTotalPageIndexSize(const std::shared_ptr<::parquet::FileMetaData>& metadata) {
+ int64_t total_page_index_size = 0;
+ for (int i = 0; i < metadata->num_row_groups(); ++i) {
+ auto row_group = metadata->RowGroup(i);
+ for (int j = 0; j < row_group->num_columns(); ++j) {
+ auto column = row_group->ColumnChunk(j);
+ total_page_index_size +=
+ column->GetColumnIndexLocation().value_or(parquet::IndexLocation{0, 0}).length;
+ }
+ }
+ return total_page_index_size;
+}
+
+void WriteColumn(::benchmark::State& state, const std::shared_ptr<::arrow::Table>& table,
+ SizeStatisticsLevel stats_level) {
+ // Use the fastest possible encoding and compression settings, to better exhibit
+ // the size statistics overhead.
+ auto properties = WriterProperties::Builder()
+ .enable_statistics()
+ ->enable_write_page_index()
+ ->disable_dictionary()
+ ->encoding(Encoding::PLAIN)
+ ->set_size_statistics_level(stats_level)
+ ->build();
+
+ for (auto _ : state) {
+ auto output = parquet::CreateOutputStream();
+ ARROW_EXPECT_OK(::parquet::arrow::WriteTable(
+ *table, ::arrow::default_memory_pool(),
+ std::static_pointer_cast<::arrow::io::OutputStream>(output),
+ DEFAULT_MAX_ROW_GROUP_LENGTH, properties));
+
+ if (state.counters.find("page_index_size") == state.counters.end()) {
+ state.PauseTiming();
+ auto metadata = parquet::ReadMetaData(
+ std::make_shared<::arrow::io::BufferReader>(output->Finish().ValueOrDie()));
+ state.counters["output_size"] = static_cast<double>(output->Tell().ValueOrDie());
+ state.counters["page_index_size"] =
+ static_cast<double>(GetTotalPageIndexSize(metadata));
+ state.ResumeTiming();
+ }
+ }
+
+ state.SetItemsProcessed(state.iterations() * kBenchmarkSize);
+ state.SetBytesProcessed(state.iterations() * GetTotalBytes(table));
+}
+
+template <SizeStatisticsLevel level, typename ArrowType>
+void BM_WritePrimitiveColumn(::benchmark::State& state) {
+ ::arrow::random::RandomArrayGenerator generator(/*seed=*/42);
+ auto type = std::make_shared<ArrowType>();
+ auto array = generator.ArrayOf(type, kBenchmarkSize, kNullProbability);
+ auto table = ::arrow::Table::Make(
+ ::arrow::schema({::arrow::field("column", type, kNullProbability > 0)}),
{array});
+ WriteColumn(state, table, level);
+}
+
+template <SizeStatisticsLevel level, typename ArrowType>
+void BM_WriteListColumn(::benchmark::State& state) {
+ ::arrow::random::RandomArrayGenerator generator(/*seed=*/42);
+ auto element_type = std::make_shared<ArrowType>();
+ auto element_array = generator.ArrayOf(element_type, kBenchmarkSize, kNullProbability);
+ auto list_type = ::arrow::list(element_type);
+ auto list_array = generator.List(*element_array, kBenchmarkSize / 10, kNullProbability);
+ auto table = ::arrow::Table::Make(
+ ::arrow::schema({::arrow::field("column", list_type, kNullProbability >
0)}),
+ {list_array});
+ WriteColumn(state, table, level);
+}
+
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
+ ::arrow::Int64Type);
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk,
+ ::arrow::Int64Type);
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::PageAndColumnChunk,
+ ::arrow::Int64Type);
+
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
+ ::arrow::StringType);
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk,
+ ::arrow::StringType);
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::PageAndColumnChunk,
+ ::arrow::StringType);
+
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::Int64Type);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk,
+ ::arrow::Int64Type);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk,
+ ::arrow::Int64Type);
+
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None, ::arrow::StringType);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk,
+ ::arrow::StringType);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk,
+ ::arrow::StringType);
+
+} // namespace parquet::benchmark