This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b1480a6 ARROW-10120: [C++] Add two-level nested Parquet read to Arrow benchmarks
b1480a6 is described below
commit b1480a645042aa32b51e1709d0d016a83d273074
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Oct 6 19:15:47 2020 +0200
ARROW-10120: [C++] Add two-level nested Parquet read to Arrow benchmarks
Closes #8342 from pitrou/ARROW-10120-nested-pq-benchmarks
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/testing/random.cc | 33 ++++-
cpp/src/arrow/testing/random.h | 17 ++-
cpp/src/parquet/arrow/reader_writer_benchmark.cc | 181 ++++++++++++++++++-----
3 files changed, 191 insertions(+), 40 deletions(-)
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index d32e1d5..32007f8 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -31,6 +31,7 @@
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
#include "arrow/util/logging.h"
namespace arrow {
@@ -268,18 +269,21 @@ std::shared_ptr<Array>
RandomArrayGenerator::StringWithRepeats(int64_t size,
std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t
first_offset,
int32_t last_offset,
- double null_probability) {
+ double null_probability,
+ bool force_empty_nulls) {
using GenOpt = GenerateOptions<int32_t,
std::uniform_int_distribution<int32_t>>;
GenOpt options(seed(), first_offset, last_offset, null_probability);
BufferVector buffers{2};
int64_t null_count = 0;
+
buffers[0] = *AllocateEmptyBitmap(size);
- options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
+ uint8_t* null_bitmap = buffers[0]->mutable_data();
+ options.GenerateBitmap(null_bitmap, size, &null_count);
// Make sure the first and last entry are non-null
- arrow::BitUtil::SetBit(buffers[0]->mutable_data(), 0);
- arrow::BitUtil::SetBit(buffers[0]->mutable_data(), size - 1);
+ arrow::BitUtil::SetBit(null_bitmap, 0);
+ arrow::BitUtil::SetBit(null_bitmap, size - 1);
buffers[1] = *AllocateBuffer(sizeof(int32_t) * size);
auto data = reinterpret_cast<int32_t*>(buffers[1]->mutable_data());
@@ -292,10 +296,31 @@ std::shared_ptr<Array>
RandomArrayGenerator::Offsets(int64_t size, int32_t first
data[0] = first_offset;
data[size - 1] = last_offset;
+ if (force_empty_nulls) {
+ arrow::internal::BitmapReader reader(null_bitmap, 0, size);
+ for (int64_t i = 0; i < size; ++i) {
+ if (reader.IsNotSet()) {
+ // Ensure a null entry corresponds to a 0-sized list extent
+ // (note this can be neither the first nor the last list entry, see
above)
+ data[i + 1] = data[i];
+ }
+ reader.Next();
+ }
+ }
+
auto array_data = ArrayData::Make(int32(), size, buffers, null_count);
return std::make_shared<Int32Array>(array_data);
}
+std::shared_ptr<Array> RandomArrayGenerator::List(const Array& values, int64_t
size,
+ double null_probability,
+ bool force_empty_nulls) {
+ auto offsets = Offsets(size, static_cast<int32_t>(values.offset()),
+ static_cast<int32_t>(values.offset() +
values.length()),
+ null_probability, force_empty_nulls);
+ return *::arrow::ListArray::FromArrays(*offsets, values);
+}
+
namespace {
struct RandomArrayGeneratorOfImpl {
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index 1fb6563..6f04d31 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -229,10 +229,12 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] first_offset the first offset value (usually 0)
/// \param[in] last_offset the last offset value (usually the size of the
child array)
/// \param[in] null_probability the probability of an offset being null
+ /// \param[in] force_empty_nulls if true, null offsets must have 0 "length"
///
/// \return a generated Array
std::shared_ptr<Array> Offsets(int64_t size, int32_t first_offset, int32_t
last_offset,
- double null_probability = 0);
+ double null_probability = 0,
+ bool force_empty_nulls = false);
/// \brief Generate a random StringArray
///
@@ -281,7 +283,18 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
int32_t min_length, int32_t
max_length,
double null_probability = 0);
- /// \brief Randomly generate an Array of the specified type, size, and
null_probability.
+ /// \brief Generate a random ListArray
+ ///
+ /// \param[in] values The underlying values array
+ /// \param[in] size The size of the generated list array
+ /// \param[in] null_probability the probability of a list value being null
+ /// \param[in] force_empty_nulls if true, null list entries must have 0
length
+ ///
+ /// \return a generated Array
+ std::shared_ptr<Array> List(const Array& values, int64_t size, double
null_probability,
+ bool force_empty_nulls = false);
+
+ /// \brief Generate a random Array of the specified type, size, and
null_probability.
///
/// Generation parameters other than size and null_probability are
determined based on
/// the type of Array to be generated.
diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc
b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
index 3bb1e75..134cedc 100644
--- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc
+++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
@@ -30,9 +30,13 @@
#include "arrow/api.h"
#include "arrow/testing/random.h"
+#include "arrow/util/bitmap_ops.h"
#include "arrow/util/logging.h"
+using arrow::Array;
+using arrow::ArrayVector;
using arrow::BooleanBuilder;
+using arrow::FieldVector;
using arrow::NumericBuilder;
#define EXIT_NOT_OK(s) \
@@ -223,6 +227,17 @@ static void BenchmarkReadTable(::benchmark::State& state,
const ::arrow::Table&
}
}
+static void BenchmarkReadArray(::benchmark::State& state,
+ const std::shared_ptr<Array>& array, bool
nullable,
+ int64_t num_values = -1, int64_t
bytes_per_value = -1) {
+ auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+ auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+ EXIT_NOT_OK(table->Validate());
+
+ BenchmarkReadTable(state, *table, num_values, bytes_per_value);
+}
+
//
// Benchmark reading a primitive column
//
@@ -302,6 +317,54 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
// Benchmark reading a nested column
//
+const std::vector<int64_t> kNestedNullPercents = {0, 1, 50, 99};
+
+// XXX We can use ArgsProduct() starting from Benchmark 1.5.2
+static void NestedReadArguments(::benchmark::internal::Benchmark* b) {
+ for (const auto null_percentage : kNestedNullPercents) {
+ b->Arg(null_percentage);
+ }
+}
+
+static std::shared_ptr<Array>
MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
+ const ArrayVector& children,
+ double null_probability,
+ bool propagate_validity = false)
{
+ ARROW_CHECK_GT(children.size(), 0);
+ const int64_t length = children[0]->length();
+
+ std::shared_ptr<::arrow::Buffer> null_bitmap;
+ if (null_probability > 0.0) {
+ null_bitmap = rng->NullBitmap(length, null_probability);
+ if (propagate_validity) {
+ // HACK: the Parquet writer currently doesn't allow non-empty list
+ // entries where a parent node is null (for instance, a struct-of-list
+ // where the outer struct is marked null but the inner list value is
+ // non-empty).
+ for (const auto& child : children) {
+ null_bitmap = *::arrow::internal::BitmapOr(
+ ::arrow::default_memory_pool(), null_bitmap->data(), 0,
+ child->null_bitmap_data(), 0, length, 0);
+ }
+ }
+ }
+ FieldVector fields(children.size());
+ char field_name = 'a';
+ for (size_t i = 0; i < children.size(); ++i) {
+ fields[i] = field(std::string{field_name++}, children[i]->type(),
+ /*nullable=*/null_probability > 0.0);
+ }
+ return *::arrow::StructArray::Make(children, std::move(fields), null_bitmap);
+}
+
+// Make a (int32, int64) struct array
+static std::shared_ptr<Array>
MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
+ int64_t size, double
null_probability) {
+ auto values1 = rng->Int32(size, -5, 5, null_probability);
+ auto values2 = rng->Int64(size, -12345678912345LL, 12345678912345LL,
null_probability);
+ return MakeStructArray(rng, {values1, values2}, null_probability);
+}
+
static void BM_ReadStructColumn(::benchmark::State& state) {
constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
const double null_probability = static_cast<double>(state.range(0)) / 100.0;
@@ -309,38 +372,79 @@ static void BM_ReadStructColumn(::benchmark::State&
state) {
ARROW_CHECK_GE(null_probability, 0.0);
+ const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+ auto array = MakeStructArray(&rng, kNumValues, null_probability);
+
+ BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)->Apply(NestedReadArguments);
+
+static void BM_ReadStructOfStructColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ const int64_t kBytesPerValue = 2 * (sizeof(int32_t) + sizeof(int64_t));
+
::arrow::random::RandomArrayGenerator rng(42);
+ auto values1 = MakeStructArray(&rng, kNumValues, null_probability);
+ auto values2 = MakeStructArray(&rng, kNumValues, null_probability);
+ auto array = MakeStructArray(&rng, {values1, values2}, null_probability);
+
+ BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructOfStructColumn)->Apply(NestedReadArguments);
+
+static void BM_ReadStructOfListColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
auto values2 =
rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL,
null_probability);
+ auto list1 = rng.List(*values1, kNumValues / 10, null_probability);
+ auto list2 = rng.List(*values2, kNumValues / 10, null_probability);
+ auto array = MakeStructArray(&rng, {list1, list2}, null_probability,
+ /*propagate_validity =*/true);
- const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+ BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+}
- std::shared_ptr<::arrow::Buffer> null_bitmap;
- if (nullable) {
- null_bitmap = rng.NullBitmap(kNumValues, null_probability);
- }
- auto array = *::arrow::StructArray::Make(
- {values1, values2},
- ::arrow::FieldVector{field("a", values1->type(), nullable),
- field("b", values2->type(), nullable)},
- null_bitmap);
- auto schema = ::arrow::schema({field("s", array->type(), nullable)});
- auto table = ::arrow::Table::Make(schema, {array}, array->length());
+BENCHMARK(BM_ReadStructOfListColumn)->Apply(NestedReadArguments);
- EXIT_NOT_OK(table->Validate());
+static void BM_ReadListColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
+ const int64_t kBytesPerValue = sizeof(int64_t);
+
+ auto array = rng.List(*values, kNumValues / 10, null_probability);
- BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+ BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
}
-BENCHMARK(BM_ReadStructColumn)
- ->Arg(/*null_percentage=*/0)
- ->Arg(/*null_percentage=*/1)
- ->Arg(/*null_percentage=*/50)
- ->Arg(/*null_percentage=*/99);
+BENCHMARK(BM_ReadListColumn)->Apply(NestedReadArguments);
-static void BM_ReadListColumn(::benchmark::State& state) {
+static void BM_ReadListOfStructColumn(::benchmark::State& state) {
constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
const double null_probability = static_cast<double>(state.range(0)) / 100.0;
const bool nullable = (null_probability != 0.0);
@@ -349,26 +453,35 @@ static void BM_ReadListColumn(::benchmark::State& state) {
::arrow::random::RandomArrayGenerator rng(42);
- auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
- auto offsets = rng.Offsets(kNumValues / 10, 0,
static_cast<int32_t>(values->length()),
- null_probability);
+ auto values = MakeStructArray(&rng, kNumValues, null_probability);
+ const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
- const int64_t kBytesPerValue = sizeof(int64_t);
+ auto array = rng.List(*values, kNumValues / 10, null_probability);
- auto array = *::arrow::ListArray::FromArrays(*offsets, *values);
- auto schema = ::arrow::schema({field("s", array->type(), nullable)});
- auto table = ::arrow::Table::Make(schema, {array}, array->length());
+ BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+}
- EXIT_NOT_OK(table->Validate());
+BENCHMARK(BM_ReadListOfStructColumn)->Apply(NestedReadArguments);
+
+static void BM_ReadListOfListColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
+ const int64_t kBytesPerValue = sizeof(int64_t);
+
+ auto inner = rng.List(*values, kNumValues / 10, null_probability);
+ auto array = rng.List(*inner, kNumValues / 100, null_probability);
- BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+ BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
}
-BENCHMARK(BM_ReadListColumn)
- ->Arg(/*null_percentage=*/0)
- ->Arg(/*null_percentage=*/1)
- ->Arg(/*null_percentage=*/50)
- ->Arg(/*null_percentage=*/99);
+BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments);
//
// Benchmark different ways of reading select row groups