This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 0c88d13341 GH-39704: [C++][Parquet] Benchmark levels decoding (#39705)
0c88d13341 is described below
commit 0c88d13341dfaba5109683bda25ee3ffcd808080
Author: mwish <[email protected]>
AuthorDate: Tue Feb 6 01:34:37 2024 +0800
GH-39704: [C++][Parquet] Benchmark levels decoding (#39705)
### Rationale for this change
This patch adds a level-decoding benchmark. It tests:
1. Different max levels (for a flat type the maximum level is 1; for nested
types it grows)
2. Different repeat counts (repeated null / non-null levels are different from
non-repeated data)
3. Different read-batch sizes. This part of the logic is a bit tricky in
the original code
### What changes are included in this PR?
Add a level-decoding benchmark
### Are these changes tested?
No need
### Are there any user-facing changes?
no
* Closes: #39704
Authored-by: mwish <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/column_reader_benchmark.cc | 98 ++++++++++++++++++++++++++++++
cpp/src/parquet/column_writer_test.cc | 4 +-
2 files changed, 100 insertions(+), 2 deletions(-)
diff --git a/cpp/src/parquet/column_reader_benchmark.cc
b/cpp/src/parquet/column_reader_benchmark.cc
index 49b2317ede..61fe397cf1 100644
--- a/cpp/src/parquet/column_reader_benchmark.cc
+++ b/cpp/src/parquet/column_reader_benchmark.cc
@@ -219,5 +219,103 @@ BENCHMARK(RecordReaderReadRecords)
->Args({2, 1000, true})
->Args({2, 1000, false});
+void GenerateLevels(int level_repeats, int max_level, int num_levels,
+ std::vector<int16_t>* levels) {
+ // Append `num_levels` random levels drawn uniformly from [0, max_level] to `*levels`, emitted in runs of `level_repeats` identical values.
+ std::default_random_engine gen(/*seed=*/1943); // fixed seed, so the sequence is repeatable for a given standard library
+ std::uniform_int_distribution<int16_t> d(0, max_level);
+ for (int i = 0; i < num_levels;) {
+ int16_t current_level = d(gen); // one draw supplies the value for a whole run of up to `level_repeats` levels
+ const int current_repeated = std::min(level_repeats, num_levels - i); // the last run is cut short so exactly num_levels are produced
+ levels->insert(levels->end(), current_repeated, current_level);
+ i += current_repeated; // NOTE(review): does not advance when level_repeats is 0 (loop never ends); callers must pass level_repeats >= 1
+ }
+}
+
+void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
+ const int16_t* input_levels, std::vector<uint8_t>* bytes) {
+ LevelEncoder encoder;
+ // Encode `num_levels` values from `input_levels` into `*bytes` (resized here). RLE output is preceded by an int32 length field; any other encoding is written starting at offset 0.
+ if (encoding == Encoding::RLE) {
+ int rle_size = LevelEncoder::MaxBufferSize(encoding, max_level,
num_levels);
+ bytes->resize(rle_size + sizeof(int32_t));
+ // Reserve sizeof(int32_t) bytes at the front for the RLE length value; the encoder writes after that prefix.
+ encoder.Init(encoding, max_level, num_levels, bytes->data() +
sizeof(int32_t),
+ rle_size);
+ encoder.Encode(num_levels, input_levels);
+ int data_length = encoder.len();
+ memcpy(bytes->data(), &data_length, sizeof(int32_t)); // store encoder.len() in the reserved prefix, in native byte order
+ } else {
+ int bitpack_size =
+ LevelEncoder::MaxBufferSize(encoding, max_level, num_levels) +
sizeof(int32_t);
+ bytes->resize(bitpack_size); // NOTE(review): sized to MaxBufferSize + sizeof(int32_t) and not shrunk after encoding
+ encoder.Init(encoding, max_level, num_levels, bytes->data(),
+ static_cast<int>(bytes->size()));
+ encoder.Encode(num_levels, input_levels);
+ }
+}
+
+static void DecodeLevels(Encoding::type level_encoding, int16_t max_level, int
num_levels,
+ int batch_size, int level_repeat_count,
+ ::benchmark::State& state) {
+ std::vector<uint8_t> bytes; // encoded levels, built once before the benchmark loop
+ { // inner scope: the unencoded input_levels vector is destroyed before the benchmark loop starts
+ std::vector<int16_t> input_levels;
+ GenerateLevels(/*level_repeats=*/level_repeat_count,
/*max_level=*/max_level,
+ num_levels, &input_levels);
+ EncodeLevels(level_encoding, max_level, num_levels, input_levels.data(),
&bytes);
+ }
+
+ LevelDecoder decoder;
+ std::vector<int16_t> output_levels(batch_size); // output buffer reused by every Decode call
+ for (auto _ : state) {
+ state.PauseTiming(); // keep the decoder reset below out of the measured time
+ decoder.SetData(level_encoding, max_level, num_levels, bytes.data(),
+ static_cast<int>(bytes.size()));
+ state.ResumeTiming();
+ // Call Decode() with batch_size repeatedly until it returns 0 levels (presumably: all num_levels consumed)
+ while (true) {
+ int levels_decoded = decoder.Decode(batch_size, output_levels.data());
+ if (levels_decoded == 0) {
+ break;
+ }
+ }
+ }
+ state.SetBytesProcessed(state.iterations() * num_levels * sizeof(int16_t)); // counts decoded int16_t output bytes, not encoded input bytes
+ state.SetItemsProcessed(state.iterations() * num_levels); // one item == one decoded level
+}
+
+static void ReadLevels_Rle(::benchmark::State& state) {
+ int16_t max_level = static_cast<int16_t>(state.range(0)); // arg 0: "MaxLevel"
+ int num_levels = static_cast<int>(state.range(1)); // arg 1: "NumLevels"
+ int batch_size = static_cast<int>(state.range(2)); // arg 2: "BatchSize"
+ int level_repeat_count = static_cast<int>(state.range(3)); // arg 3: "LevelRepeatCount"
+ DecodeLevels(Encoding::RLE, max_level, num_levels, batch_size,
level_repeat_count,
+ state); // runs the shared decode benchmark with the RLE level encoding
+}
+
+static void ReadLevels_BitPack(::benchmark::State& state) {
+ int16_t max_level = static_cast<int16_t>(state.range(0)); // arg 0: "MaxLevel"
+ int num_levels = static_cast<int>(state.range(1)); // arg 1: "NumLevels"
+ int batch_size = static_cast<int>(state.range(2)); // arg 2: "BatchSize"
+ int level_repeat_count = static_cast<int>(state.range(3)); // arg 3: "LevelRepeatCount"
+ DecodeLevels(Encoding::BIT_PACKED, max_level, num_levels, batch_size,
+ level_repeat_count, state); // runs the shared decode benchmark with the BIT_PACKED level encoding
+}
+
+static void ReadLevelsArguments(::benchmark::internal::Benchmark* b) {
+ b->ArgNames({"MaxLevel", "NumLevels", "BatchSize", "LevelRepeatCount"}) // order matches state.range(0..3) in the ReadLevels_* functions
+ ->Args({1, 8096, 1024, 1}) // max level 1, every level drawn independently
+ ->Args({1, 8096, 1024, 7}) // max level 1, runs of 7 identical levels
+ ->Args({1, 8096, 1024, 1024}) // max level 1, runs of 1024 identical levels
+ ->Args({1, 8096, 2048, 1}) // max level 1, larger decode batch
+ ->Args({3, 8096, 1024, 1}) // max level 3, every level drawn independently
+ ->Args({3, 8096, 2048, 1}) // max level 3, larger decode batch
+ ->Args({3, 8096, 1024, 7}); // max level 3, runs of 7 identical levels
+}
+
+BENCHMARK(ReadLevels_Rle)->Apply(ReadLevelsArguments); // both encodings are registered with the same argument sets
+BENCHMARK(ReadLevels_BitPack)->Apply(ReadLevelsArguments);
+
} // namespace benchmark
} // namespace parquet
diff --git a/cpp/src/parquet/column_writer_test.cc
b/cpp/src/parquet/column_writer_test.cc
index 97421629d2..a40e71ce30 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -1021,7 +1021,7 @@ void EncodeLevels(Encoding::type encoding, int16_t
max_level, int num_levels,
}
void VerifyDecodingLevels(Encoding::type encoding, int16_t max_level,
- std::vector<int16_t>& input_levels,
+ const std::vector<int16_t>& input_levels,
std::vector<uint8_t>& bytes) {
LevelDecoder decoder;
int levels_count = 0;
@@ -1060,7 +1060,7 @@ void VerifyDecodingLevels(Encoding::type encoding,
int16_t max_level,
}
void VerifyDecodingMultipleSetData(Encoding::type encoding, int16_t max_level,
- std::vector<int16_t>& input_levels,
+ const std::vector<int16_t>& input_levels,
std::vector<std::vector<uint8_t>>& bytes) {
LevelDecoder decoder;
int levels_count = 0;