This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f3d46398d3 GH-41760: [C++][Parquet] Add file metadata read/write
benchmark (#41761)
f3d46398d3 is described below
commit f3d46398d3c81d9575ffd77ce3b86d4b993a4888
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed May 22 15:06:04 2024 +0200
GH-41760: [C++][Parquet] Add file metadata read/write benchmark (#41761)
Following the discussions on the Parquet mailing list (see [this
thread](https://lists.apache.org/thread/5jyhzkwyrjk9z52g0b49g31ygnz73gxo) and
[this
thread](https://lists.apache.org/thread/vs3w2z5bk6s3c975rrkqdttr1dpsdn7h)), and
the various complaints about poor Parquet metadata performance on wide schemas,
this adds a benchmark that measures the overhead of parsing or serializing
Parquet file metadata for different numbers of row groups and columns.
Sample output:
```
-----------------------------------------------------------------------------------------------------------------------
Benchmark Time
CPU Iterations UserCounters...
-----------------------------------------------------------------------------------------------------------------------
WriteFileMetadataAndData/num_columns:1/num_row_groups:1 11743 ns
11741 ns 59930 data_size=54 file_size=290
items_per_second=85.1726k/s
WriteFileMetadataAndData/num_columns:1/num_row_groups:100 843137 ns
842920 ns 832 data_size=5.4k file_size=20.486k
items_per_second=1.18635k/s
WriteFileMetadataAndData/num_columns:1/num_row_groups:1000 8232304 ns
8230294 ns 85 data_size=54k file_size=207.687k
items_per_second=121.502/s
WriteFileMetadataAndData/num_columns:10/num_row_groups:1 101214 ns
101190 ns 6910 data_size=540 file_size=2.11k
items_per_second=9.8824k/s
WriteFileMetadataAndData/num_columns:10/num_row_groups:100 8026185 ns
8024361 ns 87 data_size=54k file_size=193.673k
items_per_second=124.621/s
WriteFileMetadataAndData/num_columns:10/num_row_groups:1000 81370293 ns
81343455 ns 8 data_size=540k file_size=1.94392M
items_per_second=12.2936/s
WriteFileMetadataAndData/num_columns:100/num_row_groups:1 955862 ns
955528 ns 733 data_size=5.4k file_size=20.694k
items_per_second=1.04654k/s
WriteFileMetadataAndData/num_columns:100/num_row_groups:100 80115516 ns
80086117 ns 9 data_size=540k file_size=1.94729M
items_per_second=12.4866/s
WriteFileMetadataAndData/num_columns:100/num_row_groups:1000 856428565 ns
856065370 ns 1 data_size=5.4M file_size=19.7673M
items_per_second=1.16814/s
WriteFileMetadataAndData/num_columns:1000/num_row_groups:1 9330003 ns
9327439 ns 75 data_size=54k file_size=211.499k
items_per_second=107.211/s
WriteFileMetadataAndData/num_columns:1000/num_row_groups:100 834609159 ns
834354590 ns 1 data_size=5.4M file_size=19.9623M
items_per_second=1.19853/s
ReadFileMetadata/num_columns:1/num_row_groups:1 3824 ns
3824 ns 182381 data_size=54 file_size=290
items_per_second=261.518k/s
ReadFileMetadata/num_columns:1/num_row_groups:100 88519 ns
88504 ns 7879 data_size=5.4k file_size=20.486k
items_per_second=11.299k/s
ReadFileMetadata/num_columns:1/num_row_groups:1000 849558 ns
849391 ns 825 data_size=54k file_size=207.687k
items_per_second=1.17731k/s
ReadFileMetadata/num_columns:10/num_row_groups:1 19918 ns
19915 ns 35449 data_size=540 file_size=2.11k
items_per_second=50.2138k/s
ReadFileMetadata/num_columns:10/num_row_groups:100 715822 ns
715667 ns 975 data_size=54k file_size=193.673k
items_per_second=1.3973k/s
ReadFileMetadata/num_columns:10/num_row_groups:1000 7017008 ns
7015432 ns 100 data_size=540k file_size=1.94392M
items_per_second=142.543/s
ReadFileMetadata/num_columns:100/num_row_groups:1 175988 ns
175944 ns 3958 data_size=5.4k file_size=20.694k
items_per_second=5.68363k/s
ReadFileMetadata/num_columns:100/num_row_groups:100 6814382 ns
6812781 ns 103 data_size=540k file_size=1.94729M
items_per_second=146.783/s
ReadFileMetadata/num_columns:100/num_row_groups:1000 77858645 ns
77822157 ns 9 data_size=5.4M file_size=19.7673M
items_per_second=12.8498/s
ReadFileMetadata/num_columns:1000/num_row_groups:1 1670001 ns
1669563 ns 419 data_size=54k file_size=211.499k
items_per_second=598.959/s
ReadFileMetadata/num_columns:1000/num_row_groups:100 77339599 ns
77292924 ns 9 data_size=5.4M file_size=19.9623M
items_per_second=12.9378/s
```
* GitHub Issue: #41760
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/CMakeLists.txt | 1 +
cpp/src/parquet/metadata_benchmark.cc | 156 ++++++++++++++++++++++++++++++++++
2 files changed, 157 insertions(+)
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index 93f2e72d8d..5ac5085a69 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -432,6 +432,7 @@ add_parquet_benchmark(column_reader_benchmark)
add_parquet_benchmark(column_io_benchmark)
add_parquet_benchmark(encoding_benchmark)
add_parquet_benchmark(level_conversion_benchmark)
+add_parquet_benchmark(metadata_benchmark)
add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc
benchmark_util.cc)
add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow")
diff --git a/cpp/src/parquet/metadata_benchmark.cc b/cpp/src/parquet/metadata_benchmark.cc
new file mode 100644
index 0000000000..97a99be798
--- /dev/null
+++ b/cpp/src/parquet/metadata_benchmark.cc
@@ -0,0 +1,156 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+#include "arrow/buffer.h"
+#include "arrow/io/memory.h"
+#include "arrow/util/logging.h"
+
+#include "parquet/column_writer.h"
+#include "parquet/file_reader.h"
+#include "parquet/file_writer.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+using ::arrow::Buffer;
+using ::arrow::io::BufferOutputStream;
+using ::arrow::io::BufferReader;
+using schema::GroupNode;
+using schema::NodePtr;
+using schema::NodeVector;
+
+// Helper shared by the metadata benchmarks below.
+//
+// It builds a flat schema made of `num_columns` required INT32 columns named
+// "col0", "col1", ... and can write an in-memory Parquet file containing
+// `num_row_groups` row groups (one value per column per row group), or read
+// such a file back and validate its metadata.
+class MetadataBenchmark {
+ public:
+  // Takes the dimensions from the benchmark arguments: range(0) is the number
+  // of columns, range(1) is the number of row groups.
+  explicit MetadataBenchmark(benchmark::State* state)
+      : MetadataBenchmark(static_cast<int>(state->range(0)),
+                          static_cast<int>(state->range(1))) {}
+
+  // Builds the schema and the writer properties (format version 2.6,
+  // dictionary encoding disabled, V2 data pages).
+  MetadataBenchmark(int num_columns, int num_row_groups)
+      : num_columns_(num_columns), num_row_groups_(num_row_groups) {
+    NodeVector fields;
+    for (int i = 0; i < num_columns_; ++i) {
+      fields.push_back(
+          parquet::schema::Int32("col" + std::to_string(i), Repetition::REQUIRED));
+    }
+    schema_root_ = std::static_pointer_cast<GroupNode>(
+        GroupNode::Make("schema", Repetition::REQUIRED, fields));
+
+    WriterProperties::Builder prop_builder;
+    writer_properties_ = prop_builder.version(ParquetVersion::PARQUET_2_6)
+                             ->disable_dictionary()
+                             ->data_page_version(ParquetDataPageVersion::V2)
+                             ->build();
+  }
+
+  // Writes a complete Parquet file to an in-memory sink and returns its
+  // contents. Also publishes two user counters on `state`:
+  //   - "file_size": size in bytes of the finished buffer
+  //   - "data_size": sum over row groups of total_compressed_bytes_written()
+  std::shared_ptr<Buffer> WriteFile(benchmark::State* state) const {
+    PARQUET_ASSIGN_OR_THROW(auto sink, BufferOutputStream::Create());
+
+    auto writer = ParquetFileWriter::Open(sink, schema_root_, writer_properties_);
+    // Each column chunk receives exactly one value.
+    const std::vector<int32_t> int32_values(1, 42);
+    int64_t data_size = 0;
+    for (int rg = 0; rg < num_row_groups_; ++rg) {
+      auto row_group_writer = writer->AppendRowGroup();
+      for (int col = 0; col < num_columns_; ++col) {
+        auto col_writer = row_group_writer->NextColumn();
+        // The downcast below is only valid for INT32 columns.
+        ARROW_CHECK_EQ(col_writer->type(), Type::INT32);
+        auto typed_col_writer = static_cast<Int32Writer*>(col_writer);
+        typed_col_writer->WriteBatch(
+            /*num_values=*/static_cast<int64_t>(int32_values.size()),
+            /*def_levels=*/nullptr, /*rep_levels=*/nullptr, int32_values.data());
+        typed_col_writer->Close();
+      }
+      row_group_writer->Close();
+      data_size += row_group_writer->total_compressed_bytes_written();
+    }
+    writer->Close();
+    PARQUET_ASSIGN_OR_THROW(auto buf, sink->Finish());
+    state->counters["file_size"] = static_cast<double>(buf->size());
+    // Note that "data_size" includes the Thrift page headers
+    state->counters["data_size"] = static_cast<double>(data_size);
+    return buf;
+  }
+
+  // Opens `contents` as a Parquet file, fetches its metadata and checks that
+  // the column, row group and row counts match what WriteFile() produces.
+  //
+  // The buffer is taken by const reference so that callers invoking this in a
+  // timed loop do not pay for an additional shared_ptr copy on every call.
+  void ReadFile(const std::shared_ptr<Buffer>& contents) const {
+    auto source = std::make_shared<BufferReader>(contents);
+    ReaderProperties props;
+    auto reader = ParquetFileReader::Open(source, props);
+    auto metadata = reader->metadata();
+    ARROW_CHECK_EQ(metadata->num_columns(), num_columns_);
+    ARROW_CHECK_EQ(metadata->num_row_groups(), num_row_groups_);
+    // There should be one row per row group
+    ARROW_CHECK_EQ(metadata->num_rows(), num_row_groups_);
+    reader->Close();
+  }
+
+ private:
+  int num_columns_;
+  int num_row_groups_;
+  std::shared_ptr<GroupNode> schema_root_;
+  std::shared_ptr<WriterProperties> writer_properties_;
+};
+
+// Registers the (num_columns, num_row_groups) argument pairs for the write
+// benchmark: every combination of {1, 10, 100, 1000} columns and
+// {1, 100, 1000} row groups, except 1000 x 1000.
+void WriteMetadataSetArgs(benchmark::internal::Benchmark* bench) {
+  bench->ArgNames({"num_columns", "num_row_groups"});
+
+  for (const int n_cols : {1, 10, 100, 1000}) {
+    for (const int n_groups : {1, 100, 1000}) {
+      // For the largest num_columns, restrict num_row_groups to small values
+      // to avoid blowing up benchmark execution time.
+      if (n_cols == 1000 && n_groups == 1000) {
+        continue;
+      }
+      bench->Args({n_cols, n_groups});
+    }
+  }
+}
+
+// The read benchmark uses exactly the same argument pairs as the write
+// benchmark, so simply delegate.
+void ReadMetadataSetArgs(benchmark::internal::Benchmark* bench) {
+  WriteMetadataSetArgs(bench);
+}
+
+// Benchmark: each iteration writes one complete in-memory Parquet file
+// (data plus metadata) for the requested number of columns and row groups.
+// One processed "item" corresponds to one file written.
+void WriteFileMetadataAndData(benchmark::State& state) {
+  MetadataBenchmark fixture(&state);
+
+  for (auto _ : state) {
+    auto file_contents = fixture.WriteFile(&state);
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+// Benchmark: the file is written once up front (outside the timed loop), then
+// each iteration opens it and validates its metadata.
+// One processed "item" corresponds to one file opened.
+void ReadFileMetadata(benchmark::State& state) {
+  MetadataBenchmark fixture(&state);
+  const auto file_contents = fixture.WriteFile(&state);
+
+  for (auto _ : state) {
+    fixture.ReadFile(file_contents);
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+// Register both benchmarks; their argument pairs are supplied by the
+// corresponding *SetArgs functions above.
+BENCHMARK(WriteFileMetadataAndData)->Apply(WriteMetadataSetArgs);
+BENCHMARK(ReadFileMetadata)->Apply(ReadMetadataSetArgs);
+
+} // namespace parquet