pitrou commented on a change in pull request #7175:
URL: https://github.com/apache/arrow/pull/7175#discussion_r425110880



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -118,13 +134,22 @@ std::shared_ptr<::arrow::Table> TableFromVector(
 
 template <>
 std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<bool>& vec,
-                                                             bool nullable) {
+                                                             bool nullable,
+                                                             int null_percentage) {
   BooleanBuilder builder;
   if (nullable) {
     std::vector<bool> valid_bytes(BENCHMARK_SIZE, 0);
-    int n = {0};
-    std::generate(valid_bytes.begin(), valid_bytes.end(),
-                  [&n] { return (n++ % 2) != 0; });
+    if (null_percentage == -1) {

Review comment:
       Perhaps the null-generation logic can be factored out?

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -95,15 +97,29 @@ void SetBytesProcessed(::benchmark::State& state) {
   state.SetBytesProcessed(bytes_processed);
 }
 
+constexpr int64_t kAlternatingOrNa = -1;
+
 template <typename ParquetType>
 std::shared_ptr<::arrow::Table> TableFromVector(
-    const std::vector<typename ParquetType::c_type>& vec, bool nullable) {
+    const std::vector<typename ParquetType::c_type>& vec, bool nullable,
+    int null_percentage = kAlternatingOrNa) {
+  if (!nullable) {
+    DCHECK(null_percentage = kAlternatingOrNa);
+  }
   std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType>>();
   NumericBuilder<ArrowType<ParquetType>> builder;
   if (nullable) {
     std::vector<uint8_t> valid_bytes(BENCHMARK_SIZE, 0);
-    int n = {0};
-    std::generate(valid_bytes.begin(), valid_bytes.end(), [&n] { return n++ % 2; });
+    if (null_percentage == -1) {

Review comment:
       Should this be `kAlternatingOrNa`?

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -187,17 +219,56 @@ static void BM_ReadColumn(::benchmark::State& state) {
   SetBytesProcessed<nullable, ParquetType>(state);
 }
 
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType);
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})

Review comment:
       Can you add a comment explaining why we're varying `first_value_percentage`? (To trigger RLE, perhaps?)

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -187,17 +219,56 @@ static void BM_ReadColumn(::benchmark::State& state) {
   SetBytesProcessed<nullable, ParquetType>(state);
 }
 
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType);
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/100})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/5, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/30, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/35, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/45, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/1})
+    ->Args({/*null_percentage=*/75, /*first_value_percentage=*/1})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType)
+    ->Args({kAlternatingOrNa, 0})
+    ->Args({1, 20});

Review comment:
       Why `1`?

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -95,15 +97,29 @@ void SetBytesProcessed(::benchmark::State& state) {
   state.SetBytesProcessed(bytes_processed);
 }
 
+constexpr int64_t kAlternatingOrNa = -1;
+
 template <typename ParquetType>
 std::shared_ptr<::arrow::Table> TableFromVector(
-    const std::vector<typename ParquetType::c_type>& vec, bool nullable) {
+    const std::vector<typename ParquetType::c_type>& vec, bool nullable,
+    int null_percentage = kAlternatingOrNa) {

Review comment:
       `kAlternatingOrNa` is declared as `int64_t` above, but this parameter is `int`.

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -187,17 +219,56 @@ static void BM_ReadColumn(::benchmark::State& state) {
   SetBytesProcessed<nullable, ParquetType>(state);
 }
 
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType);
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/100})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/5, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/30, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/35, /*first_value_percentage=*/25})

Review comment:
       Do we need such fine granularity in `null_percentage` values?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to