This is an automated email from the ASF dual-hosted git repository.

wjones127 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 22f2980e9b GH-34322: [C++][Parquet] Encoding Microbench for ByteArray 
(#34323)
22f2980e9b is described below

commit 22f2980e9ba5de684c51d6e92a74d449cb405e3d
Author: mwish <[email protected]>
AuthorDate: Wed Mar 8 06:48:23 2023 +0800

    GH-34322: [C++][Parquet] Encoding Microbench for ByteArray (#34323)
    
    
    
    ### Rationale for this change
    
    After https://github.com/apache/arrow/pull/14293, we have the
    `DELTA_LENGTH_BYTE_ARRAY` encoding for ByteArray, so I'd like to add
    encoding benchmarks for it.
    
    ### What changes are included in this PR?
    
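    Adds ByteArray benchmark cases to `encoding_benchmark.cc`: PLAIN and
    `DELTA_LENGTH_BYTE_ARRAY` encoding and decoding, spaced decoding for both
    encodings, and dictionary decoding.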
    
    ### Are these changes tested?
    
    No
    
    ### Are there any user-facing changes?
    
    No
    
    * Closes: #34322
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: Will Jones <[email protected]>
---
 cpp/src/parquet/encoding_benchmark.cc | 150 ++++++++++++++++++++++++++++++++++
 1 file changed, 150 insertions(+)

diff --git a/cpp/src/parquet/encoding_benchmark.cc 
b/cpp/src/parquet/encoding_benchmark.cc
index e6a3c2c58c..405549054e 100644
--- a/cpp/src/parquet/encoding_benchmark.cc
+++ b/cpp/src/parquet/encoding_benchmark.cc
@@ -569,6 +569,133 @@ 
BENCHMARK(BM_DeltaBitPackingDecode_Int64_Narrow)->Range(MIN_RANGE, MAX_RANGE);
 BENCHMARK(BM_DeltaBitPackingDecode_Int32_Wide)->Range(MIN_RANGE, MAX_RANGE);
 BENCHMARK(BM_DeltaBitPackingDecode_Int64_Wide)->Range(MIN_RANGE, MAX_RANGE);
 
+static void ByteArrayCustomArguments(benchmark::internal::Benchmark* b) {
+  b->ArgsProduct({{8, 64, 1024}, {512, 2048}})
+      ->ArgNames({"max-string-length", "batch-size"});
+}
+
+void EncodingByteArrayBenchmark(benchmark::State& state, Encoding::type 
encoding) {
+  ::arrow::random::RandomArrayGenerator rag(0);
+  // Using arrow generator to generate random data.
+  int32_t max_length = static_cast<int32_t>(state.range(0));
+  int32_t array_size = static_cast<int32_t>(state.range(1));
+  auto array =
+      rag.String(/* size */ array_size, /* min_length */ 0, /* max_length */ 
max_length,
+                 /* null_probability */ 0);
+  const auto array_actual =
+      ::arrow::internal::checked_pointer_cast<::arrow::StringArray>(array);
+  auto encoder = MakeTypedEncoder<ByteArrayType>(encoding);
+  std::vector<ByteArray> values;
+  for (int i = 0; i < array_actual->length(); ++i) {
+    values.emplace_back(array_actual->GetView(i));
+  }
+
+  for (auto _ : state) {
+    encoder->Put(values.data(), static_cast<int>(values.size()));
+    encoder->FlushValues();
+  }
+  state.SetItemsProcessed(state.iterations() * array_actual->length());
+  state.SetBytesProcessed(state.iterations() * 
(array_actual->value_data()->size() +
+                                                
array_actual->value_offsets()->size()));
+}
+
+static void BM_DeltaLengthEncodingByteArray(benchmark::State& state) {
+  EncodingByteArrayBenchmark(state, Encoding::DELTA_LENGTH_BYTE_ARRAY);
+}
+
+static void BM_PlainEncodingByteArray(benchmark::State& state) {
+  EncodingByteArrayBenchmark(state, Encoding::PLAIN);
+}
+
+void DecodingByteArrayBenchmark(benchmark::State& state, Encoding::type 
encoding) {
+  ::arrow::random::RandomArrayGenerator rag(0);
+  int32_t max_length = static_cast<int32_t>(state.range(0));
+  int32_t array_size = static_cast<int32_t>(state.range(1));
+  // Using arrow to write, because we just benchmark decoding here.
+  auto array =
+      rag.String(/* size */ array_size, /* min_length */ 0, /* max_length */ 
max_length,
+                 /* null_probability */ 0);
+  const auto array_actual =
+      ::arrow::internal::checked_pointer_cast<::arrow::StringArray>(array);
+  auto encoder = MakeTypedEncoder<ByteArrayType>(encoding);
+  encoder->Put(*array);
+  std::shared_ptr<Buffer> buf = encoder->FlushValues();
+
+  std::vector<ByteArray> values;
+  values.resize(array->length());
+  for (auto _ : state) {
+    auto decoder = MakeTypedDecoder<ByteArrayType>(encoding);
+    decoder->SetData(static_cast<int>(array->length()), buf->data(),
+                     static_cast<int>(buf->size()));
+    decoder->Decode(values.data(), static_cast<int>(values.size()));
+    ::benchmark::DoNotOptimize(values);
+  }
+  state.SetItemsProcessed(state.iterations() * array->length());
+  state.SetBytesProcessed(state.iterations() * 
(array_actual->value_data()->size() +
+                                                
array_actual->value_offsets()->size()));
+}
+
+static void BM_PlainDecodingByteArray(benchmark::State& state) {
+  DecodingByteArrayBenchmark(state, Encoding::PLAIN);
+}
+
+static void BM_DeltaLengthDecodingByteArray(benchmark::State& state) {
+  DecodingByteArrayBenchmark(state, Encoding::DELTA_LENGTH_BYTE_ARRAY);
+}
+
+BENCHMARK(BM_PlainEncodingByteArray)->Apply(ByteArrayCustomArguments);
+BENCHMARK(BM_DeltaLengthEncodingByteArray)->Apply(ByteArrayCustomArguments);
+BENCHMARK(BM_PlainDecodingByteArray)->Apply(ByteArrayCustomArguments);
+BENCHMARK(BM_DeltaLengthDecodingByteArray)->Apply(ByteArrayCustomArguments);
+
+static void BM_DecodingByteArraySpaced(benchmark::State& state, Encoding::type 
encoding) {
+  const double null_percent = 0.02;
+
+  auto rand = ::arrow::random::RandomArrayGenerator(0);
+  int32_t max_length = static_cast<int32_t>(state.range(0));
+  int32_t num_values = static_cast<int32_t>(state.range(1));
+  const auto array = rand.String(num_values, /* min_length */ 0,
+                                 /* max_length */ max_length, null_percent);
+  const auto valid_bits = array->null_bitmap_data();
+  const int null_count = static_cast<int>(array->null_count());
+  const auto array_actual =
+      ::arrow::internal::checked_pointer_cast<::arrow::StringArray>(array);
+
+  std::vector<ByteArray> byte_arrays;
+  byte_arrays.reserve(array_actual->length());
+  for (int i = 0; i < array_actual->length(); ++i) {
+    byte_arrays.emplace_back(array_actual->GetView(i));
+  }
+
+  auto encoder = MakeTypedEncoder<ByteArrayType>(encoding);
+  encoder->PutSpaced(byte_arrays.data(), num_values, valid_bits, 0);
+  std::shared_ptr<Buffer> buf = encoder->FlushValues();
+
+  auto decoder = MakeTypedDecoder<ByteArrayType>(encoding);
+  std::vector<uint8_t> decode_values(num_values * sizeof(ByteArray));
+  auto decode_buf = reinterpret_cast<ByteArray*>(decode_values.data());
+  for (auto _ : state) {
+    decoder->SetData(num_values - null_count, buf->data(), 
static_cast<int>(buf->size()));
+    decoder->DecodeSpaced(decode_buf, num_values, null_count, valid_bits, 0);
+    ::benchmark::DoNotOptimize(decode_buf);
+  }
+  state.counters["null_percent"] = null_percent * 100;
+  state.SetItemsProcessed(state.iterations() * array_actual->length());
+  state.SetBytesProcessed(state.iterations() * 
(array_actual->value_data()->size() +
+                                                
array_actual->value_offsets()->size()));
+}
+
+static void BM_PlainDecodingSpacedByteArray(benchmark::State& state) {
+  BM_DecodingByteArraySpaced(state, Encoding::PLAIN);
+}
+
+static void BM_DeltaLengthDecodingSpacedByteArray(benchmark::State& state) {
+  BM_DecodingByteArraySpaced(state, Encoding::DELTA_LENGTH_BYTE_ARRAY);
+}
+
+BENCHMARK(BM_PlainDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments);
+BENCHMARK(BM_DeltaLengthDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments);
+
 template <typename Type>
 static void DecodeDict(std::vector<typename Type::c_type>& values,
                        benchmark::State& state) {
@@ -634,6 +761,29 @@ static void 
BM_DictDecodingInt64_literals(benchmark::State& state) {
 
 BENCHMARK(BM_DictDecodingInt64_literals)->Range(MIN_RANGE, MAX_RANGE);
 
+static void BM_DictDecodingByteArray(benchmark::State& state) {
+  ::arrow::random::RandomArrayGenerator rag(0);
+  // Using arrow generator to generate random data.
+  int32_t max_length = static_cast<int32_t>(state.range(0));
+  int32_t array_size = static_cast<int32_t>(state.range(1));
+  auto array =
+      rag.String(/* size */ array_size, /* min_length */ 0, /* max_length */ 
max_length,
+                 /* null_probability */ 0);
+  const auto array_actual =
+      ::arrow::internal::checked_pointer_cast<::arrow::StringArray>(array);
+  auto encoder = MakeDictDecoder<ByteArrayType>();
+  std::vector<ByteArray> values;
+  for (int i = 0; i < array_actual->length(); ++i) {
+    values.emplace_back(array_actual->GetView(i));
+  }
+  DecodeDict<ByteArrayType>(values, state);
+  state.SetItemsProcessed(state.iterations() * array_actual->length());
+  state.SetBytesProcessed(state.iterations() * 
(array_actual->value_data()->size() +
+                                                
array_actual->value_offsets()->size()));
+}
+
+BENCHMARK(BM_DictDecodingByteArray)->Apply(ByteArrayCustomArguments);
+
 // ----------------------------------------------------------------------
 // Shared benchmarks for decoding using arrow builders
 

Reply via email to