This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 0c88d13341 GH-39704: [C++][Parquet] Benchmark levels decoding  (#39705)
0c88d13341 is described below

commit 0c88d13341dfaba5109683bda25ee3ffcd808080
Author: mwish <[email protected]>
AuthorDate: Tue Feb 6 01:34:37 2024 +0800

    GH-39704: [C++][Parquet] Benchmark levels decoding  (#39705)
    
    
    
    ### Rationale for this change
    
    This patch adds a level-decoding benchmark. It tests:
    1. Different max levels (for a flat type, the maximum level would be 1; for nested 
types, it would grow)
    2. Different repeat counts (repeated null / non-null levels behave differently from 
non-repeated data)
    3. Different read-batch sizes. This part of the logic is a bit tricky in 
the original code
    
    ### What changes are included in this PR?
    
    Add a level-decoding benchmark
    
    ### Are these changes tested?
    
    No need
    
    ### Are there any user-facing changes?
    
    no
    
    * Closes: #39704
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/column_reader_benchmark.cc | 98 ++++++++++++++++++++++++++++++
 cpp/src/parquet/column_writer_test.cc      |  4 +-
 2 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/cpp/src/parquet/column_reader_benchmark.cc 
b/cpp/src/parquet/column_reader_benchmark.cc
index 49b2317ede..61fe397cf1 100644
--- a/cpp/src/parquet/column_reader_benchmark.cc
+++ b/cpp/src/parquet/column_reader_benchmark.cc
@@ -219,5 +219,103 @@ BENCHMARK(RecordReaderReadRecords)
     ->Args({2, 1000, true})
     ->Args({2, 1000, false});
 
+void GenerateLevels(int level_repeats, int max_level, int num_levels,
+                    std::vector<int16_t>* levels) {
+  // Append `num_levels` random levels in [0, max_level], using a fixed seed
+  std::default_random_engine gen(/*seed=*/1943);
+  std::uniform_int_distribution<int16_t> d(0, max_level);
+  for (int i = 0; i < num_levels;) {
+    int16_t current_level = d(gen);  // each drawn level forms a run of up to `level_repeats`
+    const int current_repeated = std::min(level_repeats, num_levels - i);
+    levels->insert(levels->end(), current_repeated, current_level);
+    i += current_repeated;
+  }
+}
+
+void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
+                  const int16_t* input_levels, std::vector<uint8_t>* bytes) {
+  LevelEncoder encoder;
+  // Encode levels into `bytes`; only the RLE branch writes an int32 length prefix
+  if (encoding == Encoding::RLE) {
+    int rle_size = LevelEncoder::MaxBufferSize(encoding, max_level, 
num_levels);
+    bytes->resize(rle_size + sizeof(int32_t));
+    // Reserve sizeof(int32_t) bytes at the front for the RLE length value
+    encoder.Init(encoding, max_level, num_levels, bytes->data() + 
sizeof(int32_t),
+                 rle_size);
+    encoder.Encode(num_levels, input_levels);
+    int data_length = encoder.len();
+    memcpy(bytes->data(), &data_length, sizeof(int32_t));
+  } else {
+    int bitpack_size =
+        LevelEncoder::MaxBufferSize(encoding, max_level, num_levels) + 
sizeof(int32_t);
+    bytes->resize(bitpack_size);
+    encoder.Init(encoding, max_level, num_levels, bytes->data(),
+                 static_cast<int>(bytes->size()));
+    encoder.Encode(num_levels, input_levels);
+  }
+}
+
+static void DecodeLevels(Encoding::type level_encoding, int16_t max_level, int 
num_levels,
+                         int batch_size, int level_repeat_count,
+                         ::benchmark::State& state) {
+  std::vector<uint8_t> bytes;
+  {
+    std::vector<int16_t> input_levels;
+    GenerateLevels(/*level_repeats=*/level_repeat_count, 
/*max_level=*/max_level,
+                   num_levels, &input_levels);
+    EncodeLevels(level_encoding, max_level, num_levels, input_levels.data(), 
&bytes);
+  }
+
+  LevelDecoder decoder;
+  std::vector<int16_t> output_levels(batch_size);
+  for (auto _ : state) {
+    state.PauseTiming();
+    decoder.SetData(level_encoding, max_level, num_levels, bytes.data(),
+                    static_cast<int>(bytes.size()));
+    state.ResumeTiming();
+    // Decode in chunks of up to batch_size until Decode() reports 0 levels
+    while (true) {
+      int levels_decoded = decoder.Decode(batch_size, output_levels.data());
+      if (levels_decoded == 0) {
+        break;
+      }
+    }
+  }
+  state.SetBytesProcessed(state.iterations() * num_levels * sizeof(int16_t));
+  state.SetItemsProcessed(state.iterations() * num_levels);
+}
+
+static void ReadLevels_Rle(::benchmark::State& state) {
+  int16_t max_level = static_cast<int16_t>(state.range(0));
+  int num_levels = static_cast<int>(state.range(1));
+  int batch_size = static_cast<int>(state.range(2));
+  int level_repeat_count = static_cast<int>(state.range(3));
+  DecodeLevels(Encoding::RLE, max_level, num_levels, batch_size, 
level_repeat_count,
+               state);
+}
+
+static void ReadLevels_BitPack(::benchmark::State& state) {
+  int16_t max_level = static_cast<int16_t>(state.range(0));
+  int num_levels = static_cast<int>(state.range(1));
+  int batch_size = static_cast<int>(state.range(2));
+  int level_repeat_count = static_cast<int>(state.range(3));
+  DecodeLevels(Encoding::BIT_PACKED, max_level, num_levels, batch_size,
+               level_repeat_count, state);
+}
+
+static void ReadLevelsArguments(::benchmark::internal::Benchmark* b) {
+  b->ArgNames({"MaxLevel", "NumLevels", "BatchSize", "LevelRepeatCount"})
+      ->Args({1, 8096, 1024, 1})
+      ->Args({1, 8096, 1024, 7})
+      ->Args({1, 8096, 1024, 1024})
+      ->Args({1, 8096, 2048, 1})
+      ->Args({3, 8096, 1024, 1})
+      ->Args({3, 8096, 2048, 1})
+      ->Args({3, 8096, 1024, 7});
+}
+
+BENCHMARK(ReadLevels_Rle)->Apply(ReadLevelsArguments);
+BENCHMARK(ReadLevels_BitPack)->Apply(ReadLevelsArguments);
+
 }  // namespace benchmark
 }  // namespace parquet
diff --git a/cpp/src/parquet/column_writer_test.cc 
b/cpp/src/parquet/column_writer_test.cc
index 97421629d2..a40e71ce30 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -1021,7 +1021,7 @@ void EncodeLevels(Encoding::type encoding, int16_t 
max_level, int num_levels,
 }
 
 void VerifyDecodingLevels(Encoding::type encoding, int16_t max_level,
-                          std::vector<int16_t>& input_levels,
+                          const std::vector<int16_t>& input_levels,
                           std::vector<uint8_t>& bytes) {
   LevelDecoder decoder;
   int levels_count = 0;
@@ -1060,7 +1060,7 @@ void VerifyDecodingLevels(Encoding::type encoding, 
int16_t max_level,
 }
 
 void VerifyDecodingMultipleSetData(Encoding::type encoding, int16_t max_level,
-                                   std::vector<int16_t>& input_levels,
+                                   const std::vector<int16_t>& input_levels,
                                    std::vector<std::vector<uint8_t>>& bytes) {
   LevelDecoder decoder;
   int levels_count = 0;

Reply via email to