This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b1480a6  ARROW-10120: [C++] Add two-level nested Parquet read to Arrow benchmarks
b1480a6 is described below

commit b1480a645042aa32b51e1709d0d016a83d273074
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Oct 6 19:15:47 2020 +0200

    ARROW-10120: [C++] Add two-level nested Parquet read to Arrow benchmarks
    
    Closes #8342 from pitrou/ARROW-10120-nested-pq-benchmarks
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/testing/random.cc                  |  33 ++++-
 cpp/src/arrow/testing/random.h                   |  17 ++-
 cpp/src/parquet/arrow/reader_writer_benchmark.cc | 181 ++++++++++++++++++-----
 3 files changed, 191 insertions(+), 40 deletions(-)

diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index d32e1d5..32007f8 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -31,6 +31,7 @@
 #include "arrow/type_fwd.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
 #include "arrow/util/logging.h"
 
 namespace arrow {
@@ -268,18 +269,21 @@ std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,
 
 std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
                                                      int32_t last_offset,
-                                                     double null_probability) {
+                                                     double null_probability,
+                                                     bool force_empty_nulls) {
   using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
   GenOpt options(seed(), first_offset, last_offset, null_probability);
 
   BufferVector buffers{2};
 
   int64_t null_count = 0;
+
   buffers[0] = *AllocateEmptyBitmap(size);
-  options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
+  uint8_t* null_bitmap = buffers[0]->mutable_data();
+  options.GenerateBitmap(null_bitmap, size, &null_count);
   // Make sure the first and last entry are non-null
-  arrow::BitUtil::SetBit(buffers[0]->mutable_data(), 0);
-  arrow::BitUtil::SetBit(buffers[0]->mutable_data(), size - 1);
+  arrow::BitUtil::SetBit(null_bitmap, 0);
+  arrow::BitUtil::SetBit(null_bitmap, size - 1);
 
   buffers[1] = *AllocateBuffer(sizeof(int32_t) * size);
   auto data = reinterpret_cast<int32_t*>(buffers[1]->mutable_data());
@@ -292,10 +296,31 @@ std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first
   data[0] = first_offset;
   data[size - 1] = last_offset;
 
+  if (force_empty_nulls) {
+    arrow::internal::BitmapReader reader(null_bitmap, 0, size);
+    for (int64_t i = 0; i < size; ++i) {
+      if (reader.IsNotSet()) {
+        // Ensure a null entry corresponds to a 0-sized list extent
+        // (note this can be neither the first nor the last list entry, see above)
+        data[i + 1] = data[i];
+      }
+      reader.Next();
+    }
+  }
+
   auto array_data = ArrayData::Make(int32(), size, buffers, null_count);
   return std::make_shared<Int32Array>(array_data);
 }
 
+std::shared_ptr<Array> RandomArrayGenerator::List(const Array& values, int64_t size,
+                                                  double null_probability,
+                                                  bool force_empty_nulls) {
+  auto offsets = Offsets(size, static_cast<int32_t>(values.offset()),
+                         static_cast<int32_t>(values.offset() + values.length()),
+                         null_probability, force_empty_nulls);
+  return *::arrow::ListArray::FromArrays(*offsets, values);
+}
+
 namespace {
 
 struct RandomArrayGeneratorOfImpl {
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index 1fb6563..6f04d31 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -229,10 +229,12 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
   /// \param[in] first_offset the first offset value (usually 0)
   /// \param[in] last_offset the last offset value (usually the size of the child array)
   /// \param[in] null_probability the probability of an offset being null
+  /// \param[in] force_empty_nulls if true, null offsets must have 0 "length"
   ///
   /// \return a generated Array
   std::shared_ptr<Array> Offsets(int64_t size, int32_t first_offset, int32_t last_offset,
-                                 double null_probability = 0);
+                                 double null_probability = 0,
+                                 bool force_empty_nulls = false);
 
   /// \brief Generate a random StringArray
   ///
@@ -281,7 +283,18 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
                                            int32_t min_length, int32_t max_length,
                                            double null_probability = 0);
 
-  /// \brief Randomly generate an Array of the specified type, size, and null_probability.
+  /// \brief Generate a random ListArray
+  ///
+  /// \param[in] values The underlying values array
+  /// \param[in] size The size of the generated list array
+  /// \param[in] null_probability the probability of a list value being null
+  /// \param[in] force_empty_nulls if true, null list entries must have 0 length
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> List(const Array& values, int64_t size, double null_probability,
+                              bool force_empty_nulls = false);
+
+  /// \brief Generate a random Array of the specified type, size, and null_probability.
   ///
   /// Generation parameters other than size and null_probability are determined based on
   /// the type of Array to be generated.
diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
index 3bb1e75..134cedc 100644
--- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc
+++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
@@ -30,9 +30,13 @@
 
 #include "arrow/api.h"
 #include "arrow/testing/random.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/logging.h"
 
+using arrow::Array;
+using arrow::ArrayVector;
 using arrow::BooleanBuilder;
+using arrow::FieldVector;
 using arrow::NumericBuilder;
 
 #define EXIT_NOT_OK(s)                                        \
@@ -223,6 +227,17 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table&
   }
 }
 
+static void BenchmarkReadArray(::benchmark::State& state,
+                               const std::shared_ptr<Array>& array, bool nullable,
+                               int64_t num_values = -1, int64_t bytes_per_value = -1) {
+  auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+  auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+  EXIT_NOT_OK(table->Validate());
+
+  BenchmarkReadTable(state, *table, num_values, bytes_per_value);
+}
+
 //
 // Benchmark reading a primitive column
 //
@@ -302,6 +317,54 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
 // Benchmark reading a nested column
 //
 
+const std::vector<int64_t> kNestedNullPercents = {0, 1, 50, 99};
+
+// XXX We can use ArgsProduct() starting from Benchmark 1.5.2
+static void NestedReadArguments(::benchmark::internal::Benchmark* b) {
+  for (const auto null_percentage : kNestedNullPercents) {
+    b->Arg(null_percentage);
+  }
+}
+
+static std::shared_ptr<Array> MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
+                                              const ArrayVector& children,
+                                              double null_probability,
+                                              bool propagate_validity = false) {
+  ARROW_CHECK_GT(children.size(), 0);
+  const int64_t length = children[0]->length();
+
+  std::shared_ptr<::arrow::Buffer> null_bitmap;
+  if (null_probability > 0.0) {
+    null_bitmap = rng->NullBitmap(length, null_probability);
+    if (propagate_validity) {
+      // HACK: the Parquet writer currently doesn't allow non-empty list
+      // entries where a parent node is null (for instance, a struct-of-list
+      // where the outer struct is marked null but the inner list value is
+      // non-empty).
+      for (const auto& child : children) {
+        null_bitmap = *::arrow::internal::BitmapOr(
+            ::arrow::default_memory_pool(), null_bitmap->data(), 0,
+            child->null_bitmap_data(), 0, length, 0);
+      }
+    }
+  }
+  FieldVector fields(children.size());
+  char field_name = 'a';
+  for (size_t i = 0; i < children.size(); ++i) {
+    fields[i] = field(std::string{field_name++}, children[i]->type(),
+                      /*nullable=*/null_probability > 0.0);
+  }
+  return *::arrow::StructArray::Make(children, std::move(fields), null_bitmap);
+}
+
+// Make a (int32, int64) struct array
+static std::shared_ptr<Array> MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
+                                              int64_t size, double null_probability) {
+  auto values1 = rng->Int32(size, -5, 5, null_probability);
+  auto values2 = rng->Int64(size, -12345678912345LL, 12345678912345LL, null_probability);
+  return MakeStructArray(rng, {values1, values2}, null_probability);
+}
+
 static void BM_ReadStructColumn(::benchmark::State& state) {
   constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
   const double null_probability = static_cast<double>(state.range(0)) / 100.0;
@@ -309,38 +372,79 @@ static void BM_ReadStructColumn(::benchmark::State& state) {
 
   ARROW_CHECK_GE(null_probability, 0.0);
 
+  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+  auto array = MakeStructArray(&rng, kNumValues, null_probability);
+
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)->Apply(NestedReadArguments);
+
+static void BM_ReadStructOfStructColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  const int64_t kBytesPerValue = 2 * (sizeof(int32_t) + sizeof(int64_t));
+
   ::arrow::random::RandomArrayGenerator rng(42);
+  auto values1 = MakeStructArray(&rng, kNumValues, null_probability);
+  auto values2 = MakeStructArray(&rng, kNumValues, null_probability);
+  auto array = MakeStructArray(&rng, {values1, values2}, null_probability);
+
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructOfStructColumn)->Apply(NestedReadArguments);
+
+static void BM_ReadStructOfListColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
 
   auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
   auto values2 =
       rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+  auto list1 = rng.List(*values1, kNumValues / 10, null_probability);
+  auto list2 = rng.List(*values2, kNumValues / 10, null_probability);
+  auto array = MakeStructArray(&rng, {list1, list2}, null_probability,
+                               /*propagate_validity =*/true);
 
-  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+}
 
-  std::shared_ptr<::arrow::Buffer> null_bitmap;
-  if (nullable) {
-    null_bitmap = rng.NullBitmap(kNumValues, null_probability);
-  }
-  auto array = *::arrow::StructArray::Make(
-      {values1, values2},
-      ::arrow::FieldVector{field("a", values1->type(), nullable),
-                           field("b", values2->type(), nullable)},
-      null_bitmap);
-  auto schema = ::arrow::schema({field("s", array->type(), nullable)});
-  auto table = ::arrow::Table::Make(schema, {array}, array->length());
+BENCHMARK(BM_ReadStructOfListColumn)->Apply(NestedReadArguments);
 
-  EXIT_NOT_OK(table->Validate());
+static void BM_ReadListColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
+  const int64_t kBytesPerValue = sizeof(int64_t);
+
+  auto array = rng.List(*values, kNumValues / 10, null_probability);
 
-  BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
 }
 
-BENCHMARK(BM_ReadStructColumn)
-    ->Arg(/*null_percentage=*/0)
-    ->Arg(/*null_percentage=*/1)
-    ->Arg(/*null_percentage=*/50)
-    ->Arg(/*null_percentage=*/99);
+BENCHMARK(BM_ReadListColumn)->Apply(NestedReadArguments);
 
-static void BM_ReadListColumn(::benchmark::State& state) {
+static void BM_ReadListOfStructColumn(::benchmark::State& state) {
   constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
   const double null_probability = static_cast<double>(state.range(0)) / 100.0;
   const bool nullable = (null_probability != 0.0);
@@ -349,26 +453,35 @@ static void BM_ReadListColumn(::benchmark::State& state) {
 
   ::arrow::random::RandomArrayGenerator rng(42);
 
-  auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
-  auto offsets = rng.Offsets(kNumValues / 10, 0, static_cast<int32_t>(values->length()),
-                             null_probability);
+  auto values = MakeStructArray(&rng, kNumValues, null_probability);
+  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
 
-  const int64_t kBytesPerValue = sizeof(int64_t);
+  auto array = rng.List(*values, kNumValues / 10, null_probability);
 
-  auto array = *::arrow::ListArray::FromArrays(*offsets, *values);
-  auto schema = ::arrow::schema({field("s", array->type(), nullable)});
-  auto table = ::arrow::Table::Make(schema, {array}, array->length());
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+}
 
-  EXIT_NOT_OK(table->Validate());
+BENCHMARK(BM_ReadListOfStructColumn)->Apply(NestedReadArguments);
+
+static void BM_ReadListOfListColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
+  const int64_t kBytesPerValue = sizeof(int64_t);
+
+  auto inner = rng.List(*values, kNumValues / 10, null_probability);
+  auto array = rng.List(*inner, kNumValues / 100, null_probability);
 
-  BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
 }
 
-BENCHMARK(BM_ReadListColumn)
-    ->Arg(/*null_percentage=*/0)
-    ->Arg(/*null_percentage=*/1)
-    ->Arg(/*null_percentage=*/50)
-    ->Arg(/*null_percentage=*/99);
+BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments);
 
 //
 // Benchmark different ways of reading select row groups

Reply via email to