pitrou commented on code in PR #40876:
URL: https://github.com/apache/arrow/pull/40876#discussion_r1549626605


##########
cpp/src/parquet/encoding_benchmark.cc:
##########
@@ -91,6 +91,27 @@ static void BM_PlainDecodingBoolean(benchmark::State& state) 
{
 
 BENCHMARK(BM_PlainDecodingBoolean)->Range(MIN_RANGE, MAX_RANGE);
 
+static void BM_PlainDecodingBooleanToBitmap(benchmark::State& state) {
+  std::vector<bool> values(state.range(0), true);
+  uint8_t* output = new 
uint8_t[::arrow::bit_util::BytesForBits(state.range(0))];
+  auto encoder = MakeEncoder(Type::BOOLEAN, Encoding::PLAIN);
+  auto typed_encoder = dynamic_cast<BooleanEncoder*>(encoder.get());
+  typed_encoder->Put(values, static_cast<int>(values.size()));
+  std::shared_ptr<Buffer> buf = encoder->FlushValues();
+
+  for (auto _ : state) {
+    auto decoder = MakeTypedDecoder<BooleanType>(Encoding::PLAIN);
+    decoder->SetData(static_cast<int>(values.size()), buf->data(),
+                     static_cast<int>(buf->size()));
+    decoder->Decode(output, static_cast<int>(values.size()));
+  }
+  // Still set `BytesProcessed` to byte level.
+  state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(bool));

Review Comment:
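   This doesn't seem right: `output` is a bitmap of 
`BytesForBits(state.range(0))` bytes, so counting `sizeof(bool)` per value 
overstates the bytes processed. Also, can you add a `SetItemsProcessed` call 
here and above?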



##########
cpp/src/parquet/encoding_benchmark.cc:
##########
@@ -91,6 +91,27 @@ static void BM_PlainDecodingBoolean(benchmark::State& state) 
{
 
 BENCHMARK(BM_PlainDecodingBoolean)->Range(MIN_RANGE, MAX_RANGE);
 
+static void BM_PlainDecodingBooleanToBitmap(benchmark::State& state) {
+  std::vector<bool> values(state.range(0), true);
+  uint8_t* output = new 
uint8_t[::arrow::bit_util::BytesForBits(state.range(0))];

Review Comment:
   Can you use a `std::vector` instead?



##########
cpp/src/parquet/encoding_benchmark.cc:
##########
@@ -1373,4 +1408,123 @@ BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, 
DecodeArrowNonNull_Dict)
 BENCHMARK_REGISTER_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
+class BenchmarkDecodeArrowBoolean : public 
BenchmarkDecodeArrowBase<BooleanType> {
+ public:
+  void InitDataInputs() final {
+    // Generate a random string dictionary without any nulls so that this 
dataset can
+    // be used for benchmarking the DecodeArrowNonNull API
+    constexpr int repeat_factor = 8;
+    ::arrow::random::RandomArrayGenerator rag(0);
+    input_array_ =
+        rag.Boolean(num_values_, num_values_ / repeat_factor, 
null_probability_);
+    valid_bits_ = input_array_->null_bitmap_data();
+
+    // Arrow uses a bitmap representation for boolean arrays,
+    // so, we uses this as "total_size" for the benchmark.
+    total_size_ = ::arrow::bit_util::BytesForBits(num_values_);
+
+    values_.reserve(num_values_);
+    const auto& boolean_array = static_cast<const 
::arrow::BooleanArray&>(*input_array_);
+    for (int64_t i = 0; i < boolean_array.length(); i++) {
+      values_.push_back(boolean_array.Value(i));
+    }
+  }
+
+  typename EncodingTraits<BooleanType>::Accumulator CreateAccumulator() final {
+    return typename EncodingTraits<BooleanType>::Accumulator();
+  }
+
+  void DoEncodeLowLevel() final { ParquetException::NYI(); }
+
+  void DecodeArrowWithNullDenseBenchmark(benchmark::State& state);
+
+ protected:
+  double null_probability_ = 0.0;
+};
+
+void BenchmarkDecodeArrowBoolean::DecodeArrowWithNullDenseBenchmark(
+    benchmark::State& state) {
+  // Change null_probability
+  null_probability_ = static_cast<double>(state.range(1)) / 100;
+  InitDataInputs();
+  this->DoEncodeArrow();
+  int num_values_with_nulls = this->num_values_;
+
+  for (auto _ : state) {
+    auto decoder = this->InitializeDecoder();
+    auto acc = this->CreateAccumulator();
+    decoder->DecodeArrow(
+        num_values_with_nulls,
+        /*null_count=*/static_cast<int>(this->input_array_->null_count()),
+        this->valid_bits_, 0, &acc);
+  }
+  state.SetBytesProcessed(state.iterations() * 
static_cast<int64_t>(total_size_));
+}
+
+class BM_DecodeArrowBooleanPlain : public BenchmarkDecodeArrowBoolean {
+ public:
+  void DoEncodeArrow() final {
+    auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN);
+    encoder->Put(*input_array_);
+    buffer_ = encoder->FlushValues();
+  }
+
+  std::unique_ptr<TypedDecoder<BooleanType>> InitializeDecoder() override {
+    auto decoder = MakeTypedDecoder<BooleanType>(Encoding::PLAIN);
+    decoder->SetData(num_values_, buffer_->data(), 
static_cast<int>(buffer_->size()));
+    return decoder;
+  }
+};
+
+class BM_DecodeArrowBooleanRle : public BenchmarkDecodeArrowBoolean {

Review Comment:
   You don't need this; you just need a member variable holding 
`Encoding::PLAIN` or `Encoding::RLE`, right?



##########
cpp/src/parquet/encoding.cc:
##########
@@ -1188,19 +1191,52 @@ int PlainBooleanDecoder::DecodeArrow(
     typename EncodingTraits<BooleanType>::Accumulator* builder) {
   int values_decoded = num_values - null_count;
   if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) {
+    // A too large `num_values` was requested.
+    ParquetException::EofException();
+  }
+  if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(values_decoded))) {
     ParquetException::EofException();
   }
 
-  PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
-  VisitNullBitmapInline(
-      valid_bits, valid_bits_offset, num_values, null_count,
-      [&]() {
-        bool value;
-        ARROW_IGNORE_EXPR(bit_reader_->GetValue(1, &value));
-        builder->UnsafeAppend(value);
-      },
-      [&]() { builder->UnsafeAppendNull(); });
+  if (null_count == 0) {
+    // FastPath: can copy the data directly
+    PARQUET_THROW_NOT_OK(builder->AppendValues(data_, values_decoded, NULLPTR,
+                                               total_num_values_ - 
num_values_));
+  } else {
+    // Handle nulls by BitBlockCounter
+    PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+    BitBlockCounter bit_counter(valid_bits, valid_bits_offset, num_values);
+    int64_t value_position = 0;
+    int64_t valid_bits_offset_position = valid_bits_offset;
+    int64_t previous_value_offset = 0;
+    while (value_position < num_values) {
+      auto block = bit_counter.NextWord();
+      if (block.AllSet()) {
+        // Note: We don't have UnsafeAppendValues for booleans currently,
+        // so using `AppendValues` here.

Review Comment:
   Do you want to create a new issue for this?



##########
cpp/src/parquet/encoding_benchmark.cc:
##########
@@ -1373,4 +1408,123 @@ BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, 
DecodeArrowNonNull_Dict)
 BENCHMARK_REGISTER_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
+class BenchmarkDecodeArrowBoolean : public 
BenchmarkDecodeArrowBase<BooleanType> {
+ public:
+  void InitDataInputs() final {
+    // Generate a random string dictionary without any nulls so that this 
dataset can
+    // be used for benchmarking the DecodeArrowNonNull API
+    constexpr int repeat_factor = 8;
+    ::arrow::random::RandomArrayGenerator rag(0);
+    input_array_ =
+        rag.Boolean(num_values_, num_values_ / repeat_factor, 
null_probability_);
+    valid_bits_ = input_array_->null_bitmap_data();
+
+    // Arrow uses a bitmap representation for boolean arrays,
+    // so, we uses this as "total_size" for the benchmark.
+    total_size_ = ::arrow::bit_util::BytesForBits(num_values_);
+
+    values_.reserve(num_values_);
+    const auto& boolean_array = static_cast<const 
::arrow::BooleanArray&>(*input_array_);
+    for (int64_t i = 0; i < boolean_array.length(); i++) {
+      values_.push_back(boolean_array.Value(i));
+    }
+  }
+
+  typename EncodingTraits<BooleanType>::Accumulator CreateAccumulator() final {
+    return typename EncodingTraits<BooleanType>::Accumulator();
+  }
+
+  void DoEncodeLowLevel() final { ParquetException::NYI(); }
+
+  void DecodeArrowWithNullDenseBenchmark(benchmark::State& state);
+
+ protected:
+  double null_probability_ = 0.0;
+};
+
+void BenchmarkDecodeArrowBoolean::DecodeArrowWithNullDenseBenchmark(
+    benchmark::State& state) {
+  // Change null_probability
+  null_probability_ = static_cast<double>(state.range(1)) / 100;
+  InitDataInputs();
+  this->DoEncodeArrow();
+  int num_values_with_nulls = this->num_values_;
+
+  for (auto _ : state) {
+    auto decoder = this->InitializeDecoder();
+    auto acc = this->CreateAccumulator();
+    decoder->DecodeArrow(
+        num_values_with_nulls,
+        /*null_count=*/static_cast<int>(this->input_array_->null_count()),
+        this->valid_bits_, 0, &acc);
+  }
+  state.SetBytesProcessed(state.iterations() * 
static_cast<int64_t>(total_size_));
+}
+
+class BM_DecodeArrowBooleanPlain : public BenchmarkDecodeArrowBoolean {
+ public:
+  void DoEncodeArrow() final {
+    auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN);
+    encoder->Put(*input_array_);
+    buffer_ = encoder->FlushValues();
+  }
+
+  std::unique_ptr<TypedDecoder<BooleanType>> InitializeDecoder() override {
+    auto decoder = MakeTypedDecoder<BooleanType>(Encoding::PLAIN);
+    decoder->SetData(num_values_, buffer_->data(), 
static_cast<int>(buffer_->size()));
+    return decoder;
+  }
+};
+
+class BM_DecodeArrowBooleanRle : public BenchmarkDecodeArrowBoolean {
+ public:
+  void DoEncodeArrow() final {
+    auto encoder = MakeTypedEncoder<BooleanType>(Encoding::RLE);
+    encoder->Put(*input_array_);
+    buffer_ = encoder->FlushValues();
+  }
+
+  std::unique_ptr<TypedDecoder<BooleanType>> InitializeDecoder() override {
+    auto decoder = MakeTypedDecoder<BooleanType>(Encoding::RLE);
+    decoder->SetData(num_values_, buffer_->data(), 
static_cast<int>(buffer_->size()));
+    return decoder;
+  }
+};
+
+static void BooleanWithNullCustomArguments(benchmark::internal::Benchmark* b) {
+  b->ArgsProduct({
+                     benchmark::CreateRange(MIN_RANGE, MAX_RANGE, /*multi=*/2),

Review Comment:
   We don't need to test so many sizes, do we? 



##########
cpp/src/parquet/encoding_benchmark.cc:
##########
@@ -1373,4 +1408,123 @@ BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, 
DecodeArrowNonNull_Dict)
 BENCHMARK_REGISTER_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
+class BenchmarkDecodeArrowBoolean : public 
BenchmarkDecodeArrowBase<BooleanType> {
+ public:
+  void InitDataInputs() final {
+    // Generate a random string dictionary without any nulls so that this 
dataset can
+    // be used for benchmarking the DecodeArrowNonNull API
+    constexpr int repeat_factor = 8;
+    ::arrow::random::RandomArrayGenerator rag(0);
+    input_array_ =
+        rag.Boolean(num_values_, num_values_ / repeat_factor, 
null_probability_);

Review Comment:
   What is `repeat_factor` doing here?



##########
cpp/src/parquet/encoding_benchmark.cc:
##########
@@ -1373,4 +1408,123 @@ BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, 
DecodeArrowNonNull_Dict)
 BENCHMARK_REGISTER_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
+class BenchmarkDecodeArrowBoolean : public 
BenchmarkDecodeArrowBase<BooleanType> {
+ public:
+  void InitDataInputs() final {
+    // Generate a random string dictionary without any nulls so that this 
dataset can
+    // be used for benchmarking the DecodeArrowNonNull API
+    constexpr int repeat_factor = 8;
+    ::arrow::random::RandomArrayGenerator rag(0);
+    input_array_ =
+        rag.Boolean(num_values_, num_values_ / repeat_factor, 
null_probability_);
+    valid_bits_ = input_array_->null_bitmap_data();
+
+    // Arrow uses a bitmap representation for boolean arrays,
+    // so, we uses this as "total_size" for the benchmark.
+    total_size_ = ::arrow::bit_util::BytesForBits(num_values_);
+
+    values_.reserve(num_values_);
+    const auto& boolean_array = static_cast<const 
::arrow::BooleanArray&>(*input_array_);
+    for (int64_t i = 0; i < boolean_array.length(); i++) {
+      values_.push_back(boolean_array.Value(i));
+    }
+  }
+
+  typename EncodingTraits<BooleanType>::Accumulator CreateAccumulator() final {
+    return typename EncodingTraits<BooleanType>::Accumulator();
+  }
+
+  void DoEncodeLowLevel() final { ParquetException::NYI(); }
+
+  void DecodeArrowWithNullDenseBenchmark(benchmark::State& state);
+
+ protected:
+  double null_probability_ = 0.0;
+};
+
+void BenchmarkDecodeArrowBoolean::DecodeArrowWithNullDenseBenchmark(
+    benchmark::State& state) {
+  // Change null_probability
+  null_probability_ = static_cast<double>(state.range(1)) / 100;
+  InitDataInputs();
+  this->DoEncodeArrow();
+  int num_values_with_nulls = this->num_values_;
+
+  for (auto _ : state) {
+    auto decoder = this->InitializeDecoder();
+    auto acc = this->CreateAccumulator();
+    decoder->DecodeArrow(
+        num_values_with_nulls,
+        /*null_count=*/static_cast<int>(this->input_array_->null_count()),
+        this->valid_bits_, 0, &acc);
+  }
+  state.SetBytesProcessed(state.iterations() * 
static_cast<int64_t>(total_size_));
+}
+
+class BM_DecodeArrowBooleanPlain : public BenchmarkDecodeArrowBoolean {
+ public:
+  void DoEncodeArrow() final {
+    auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN);
+    encoder->Put(*input_array_);
+    buffer_ = encoder->FlushValues();
+  }
+
+  std::unique_ptr<TypedDecoder<BooleanType>> InitializeDecoder() override {
+    auto decoder = MakeTypedDecoder<BooleanType>(Encoding::PLAIN);
+    decoder->SetData(num_values_, buffer_->data(), 
static_cast<int>(buffer_->size()));
+    return decoder;
+  }
+};
+
+class BM_DecodeArrowBooleanRle : public BenchmarkDecodeArrowBoolean {
+ public:
+  void DoEncodeArrow() final {
+    auto encoder = MakeTypedEncoder<BooleanType>(Encoding::RLE);
+    encoder->Put(*input_array_);
+    buffer_ = encoder->FlushValues();
+  }
+
+  std::unique_ptr<TypedDecoder<BooleanType>> InitializeDecoder() override {
+    auto decoder = MakeTypedDecoder<BooleanType>(Encoding::RLE);
+    decoder->SetData(num_values_, buffer_->data(), 
static_cast<int>(buffer_->size()));
+    return decoder;
+  }
+};
+
+static void BooleanWithNullCustomArguments(benchmark::internal::Benchmark* b) {
+  b->ArgsProduct({
+                     benchmark::CreateRange(MIN_RANGE, MAX_RANGE, /*multi=*/2),
+                     {10, 50},
+                 })
+      ->ArgNames({"num-values", "null-prob"});
+}
+
+BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, 
DecodeArrow_Dense)(benchmark::State& state) {
+  DecodeArrowDenseBenchmark(state);
+}
+BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrow_Dense)
+    ->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull_Dense)
+(benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); }
+BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull_Dense)
+    ->Range(MIN_RANGE, MAX_RANGE);
+// TODO(mwish): RleBoolean not implemented DecodeArrow with null slots yet.
+// BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull_Dense)
+//(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); }
+// BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull_Dense)
+//    ->Apply(BooleanWithNullCustomArguments);
+
+BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrow_Dense)
+(benchmark::State& state) { DecodeArrowDenseBenchmark(state); }
+BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanPlain, DecodeArrow_Dense)
+    ->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrowNonNull_Dense)
+(benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); }
+BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanPlain, DecodeArrowNonNull_Dense)
+    ->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrowWithNull_Dense)
+(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); }
+BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanPlain, DecodeArrowWithNull_Dense)
+    ->Apply(BooleanWithNullCustomArguments);

Review Comment:
   Can we remove "Dense" from these benchmark names?



##########
cpp/src/parquet/encoding_benchmark.cc:
##########
@@ -1373,4 +1408,123 @@ BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, 
DecodeArrowNonNull_Dict)
 BENCHMARK_REGISTER_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
+class BenchmarkDecodeArrowBoolean : public 
BenchmarkDecodeArrowBase<BooleanType> {
+ public:
+  void InitDataInputs() final {
+    // Generate a random string dictionary without any nulls so that this 
dataset can

Review Comment:
   Can you update this comment? It still describes a random string 
dictionary, but this class benchmarks boolean decoding.



##########
cpp/src/parquet/encoding_benchmark.cc:
##########
@@ -1373,4 +1408,123 @@ BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, 
DecodeArrowNonNull_Dict)
 BENCHMARK_REGISTER_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
+class BenchmarkDecodeArrowBoolean : public 
BenchmarkDecodeArrowBase<BooleanType> {
+ public:
+  void InitDataInputs() final {
+    // Generate a random string dictionary without any nulls so that this 
dataset can
+    // be used for benchmarking the DecodeArrowNonNull API
+    constexpr int repeat_factor = 8;
+    ::arrow::random::RandomArrayGenerator rag(0);
+    input_array_ =
+        rag.Boolean(num_values_, num_values_ / repeat_factor, 
null_probability_);
+    valid_bits_ = input_array_->null_bitmap_data();
+
+    // Arrow uses a bitmap representation for boolean arrays,
+    // so, we uses this as "total_size" for the benchmark.
+    total_size_ = ::arrow::bit_util::BytesForBits(num_values_);
+
+    values_.reserve(num_values_);
+    const auto& boolean_array = static_cast<const 
::arrow::BooleanArray&>(*input_array_);
+    for (int64_t i = 0; i < boolean_array.length(); i++) {
+      values_.push_back(boolean_array.Value(i));
+    }
+  }
+
+  typename EncodingTraits<BooleanType>::Accumulator CreateAccumulator() final {
+    return typename EncodingTraits<BooleanType>::Accumulator();
+  }
+
+  void DoEncodeLowLevel() final { ParquetException::NYI(); }
+
+  void DecodeArrowWithNullDenseBenchmark(benchmark::State& state);
+
+ protected:
+  double null_probability_ = 0.0;
+};
+
+void BenchmarkDecodeArrowBoolean::DecodeArrowWithNullDenseBenchmark(

Review Comment:
   Let's remove "Dense" because that doesn't match the semantics.



##########
cpp/src/parquet/encoding_benchmark.cc:
##########
@@ -1373,4 +1408,123 @@ BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, 
DecodeArrowNonNull_Dict)
 BENCHMARK_REGISTER_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
+class BenchmarkDecodeArrowBoolean : public 
BenchmarkDecodeArrowBase<BooleanType> {
+ public:
+  void InitDataInputs() final {
+    // Generate a random string dictionary without any nulls so that this 
dataset can
+    // be used for benchmarking the DecodeArrowNonNull API
+    constexpr int repeat_factor = 8;
+    ::arrow::random::RandomArrayGenerator rag(0);
+    input_array_ =
+        rag.Boolean(num_values_, num_values_ / repeat_factor, 
null_probability_);
+    valid_bits_ = input_array_->null_bitmap_data();
+
+    // Arrow uses a bitmap representation for boolean arrays,
+    // so, we uses this as "total_size" for the benchmark.
+    total_size_ = ::arrow::bit_util::BytesForBits(num_values_);
+
+    values_.reserve(num_values_);
+    const auto& boolean_array = static_cast<const 
::arrow::BooleanArray&>(*input_array_);
+    for (int64_t i = 0; i < boolean_array.length(); i++) {
+      values_.push_back(boolean_array.Value(i));
+    }
+  }
+
+  typename EncodingTraits<BooleanType>::Accumulator CreateAccumulator() final {
+    return typename EncodingTraits<BooleanType>::Accumulator();
+  }
+
+  void DoEncodeLowLevel() final { ParquetException::NYI(); }
+
+  void DecodeArrowWithNullDenseBenchmark(benchmark::State& state);
+
+ protected:
+  double null_probability_ = 0.0;
+};
+
+void BenchmarkDecodeArrowBoolean::DecodeArrowWithNullDenseBenchmark(
+    benchmark::State& state) {
+  // Change null_probability
+  null_probability_ = static_cast<double>(state.range(1)) / 100;
+  InitDataInputs();
+  this->DoEncodeArrow();
+  int num_values_with_nulls = this->num_values_;
+
+  for (auto _ : state) {
+    auto decoder = this->InitializeDecoder();
+    auto acc = this->CreateAccumulator();
+    decoder->DecodeArrow(
+        num_values_with_nulls,
+        /*null_count=*/static_cast<int>(this->input_array_->null_count()),
+        this->valid_bits_, 0, &acc);
+  }
+  state.SetBytesProcessed(state.iterations() * 
static_cast<int64_t>(total_size_));
+}
+
+class BM_DecodeArrowBooleanPlain : public BenchmarkDecodeArrowBoolean {
+ public:
+  void DoEncodeArrow() final {
+    auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN);
+    encoder->Put(*input_array_);
+    buffer_ = encoder->FlushValues();
+  }
+
+  std::unique_ptr<TypedDecoder<BooleanType>> InitializeDecoder() override {
+    auto decoder = MakeTypedDecoder<BooleanType>(Encoding::PLAIN);
+    decoder->SetData(num_values_, buffer_->data(), 
static_cast<int>(buffer_->size()));
+    return decoder;
+  }
+};
+
+class BM_DecodeArrowBooleanRle : public BenchmarkDecodeArrowBoolean {
+ public:
+  void DoEncodeArrow() final {
+    auto encoder = MakeTypedEncoder<BooleanType>(Encoding::RLE);
+    encoder->Put(*input_array_);
+    buffer_ = encoder->FlushValues();
+  }
+
+  std::unique_ptr<TypedDecoder<BooleanType>> InitializeDecoder() override {
+    auto decoder = MakeTypedDecoder<BooleanType>(Encoding::RLE);
+    decoder->SetData(num_values_, buffer_->data(), 
static_cast<int>(buffer_->size()));
+    return decoder;
+  }
+};
+
+static void BooleanWithNullCustomArguments(benchmark::internal::Benchmark* b) {
+  b->ArgsProduct({
+                     benchmark::CreateRange(MIN_RANGE, MAX_RANGE, /*multi=*/2),
+                     {10, 50},

Review Comment:
   Can you instead reuse the same null percentage convention as in 
`BM_PlainDecodingSpacedBoolean`?



##########
cpp/src/parquet/encoding_benchmark.cc:
##########
@@ -1373,4 +1408,123 @@ BENCHMARK_DEFINE_F(BM_ArrowBinaryDict, 
DecodeArrowNonNull_Dict)
 BENCHMARK_REGISTER_F(BM_ArrowBinaryDict, DecodeArrowNonNull_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
+class BenchmarkDecodeArrowBoolean : public 
BenchmarkDecodeArrowBase<BooleanType> {
+ public:
+  void InitDataInputs() final {
+    // Generate a random string dictionary without any nulls so that this 
dataset can
+    // be used for benchmarking the DecodeArrowNonNull API
+    constexpr int repeat_factor = 8;
+    ::arrow::random::RandomArrayGenerator rag(0);
+    input_array_ =
+        rag.Boolean(num_values_, num_values_ / repeat_factor, 
null_probability_);
+    valid_bits_ = input_array_->null_bitmap_data();
+
+    // Arrow uses a bitmap representation for boolean arrays,
+    // so, we uses this as "total_size" for the benchmark.
+    total_size_ = ::arrow::bit_util::BytesForBits(num_values_);
+
+    values_.reserve(num_values_);
+    const auto& boolean_array = static_cast<const 
::arrow::BooleanArray&>(*input_array_);
+    for (int64_t i = 0; i < boolean_array.length(); i++) {
+      values_.push_back(boolean_array.Value(i));
+    }
+  }
+
+  typename EncodingTraits<BooleanType>::Accumulator CreateAccumulator() final {
+    return typename EncodingTraits<BooleanType>::Accumulator();
+  }
+
+  void DoEncodeLowLevel() final { ParquetException::NYI(); }
+
+  void DecodeArrowWithNullDenseBenchmark(benchmark::State& state);
+
+ protected:
+  double null_probability_ = 0.0;
+};
+
+void BenchmarkDecodeArrowBoolean::DecodeArrowWithNullDenseBenchmark(
+    benchmark::State& state) {
+  // Change null_probability
+  null_probability_ = static_cast<double>(state.range(1)) / 100;
+  InitDataInputs();
+  this->DoEncodeArrow();
+  int num_values_with_nulls = this->num_values_;
+
+  for (auto _ : state) {
+    auto decoder = this->InitializeDecoder();
+    auto acc = this->CreateAccumulator();
+    decoder->DecodeArrow(
+        num_values_with_nulls,
+        /*null_count=*/static_cast<int>(this->input_array_->null_count()),
+        this->valid_bits_, 0, &acc);
+  }
+  state.SetBytesProcessed(state.iterations() * 
static_cast<int64_t>(total_size_));

Review Comment:
   Can you also call `SetItemsProcessed`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to