This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f9904063b1 GH-35934:[C++][Parquet] PageIndex Read benchmark (#36702)
f9904063b1 is described below
commit f9904063b163c4ad44bef61e84a1e4a90b600d34
Author: mwish <[email protected]>
AuthorDate: Wed Jul 19 00:32:34 2023 +0800
GH-35934:[C++][Parquet] PageIndex Read benchmark (#36702)
### Rationale for this change
Add benchmark for read page index
### What changes are included in this PR?
Just a benchmark in `cpp/src/parquet/bloom_filter_benchmark.cc`
### Are these changes tested?
No
### Are there any user-facing changes?
No
* Closes: #35934
Lead-authored-by: mwish <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/dataset/dataset.h | 2 +-
cpp/src/parquet/CMakeLists.txt | 5 +-
cpp/src/parquet/benchmark_util.cc | 126 ++++++++++++++++++++++++++
cpp/src/parquet/benchmark_util.h | 47 ++++++++++
cpp/src/parquet/bloom_filter_benchmark.cc | 69 ++------------
cpp/src/parquet/level_conversion_benchmark.cc | 2 +-
cpp/src/parquet/page_index_benchmark.cc | 107 ++++++++++++++++++++++
cpp/src/parquet/test_util.h | 2 +-
8 files changed, 296 insertions(+), 64 deletions(-)
diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h
index 1db230b16e..39936fbd7b 100644
--- a/cpp/src/arrow/dataset/dataset.h
+++ b/cpp/src/arrow/dataset/dataset.h
@@ -82,7 +82,7 @@ class ARROW_DS_EXPORT FragmentSelection {
/// \brief Instructions for scanning a particular fragment
///
-/// The fragment scan request is dervied from ScanV2Options. The main
+/// The fragment scan request is derived from ScanV2Options. The main
/// difference is that the scan options are based on the dataset schema
/// while the fragment request is based on the fragment schema.
struct ARROW_DS_EXPORT FragmentScanRequest {
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index e6aad7cee2..eb2e2d8fed 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -401,11 +401,14 @@ endif()
add_parquet_test(file_deserialize_test SOURCES file_deserialize_test.cc
test_util.cc)
add_parquet_test(schema_test)
-add_parquet_benchmark(bloom_filter_benchmark)
+add_parquet_benchmark(bloom_filter_benchmark SOURCES bloom_filter_benchmark.cc
+ benchmark_util.cc)
add_parquet_benchmark(column_reader_benchmark)
add_parquet_benchmark(column_io_benchmark)
add_parquet_benchmark(encoding_benchmark)
add_parquet_benchmark(level_conversion_benchmark)
+add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc
+ benchmark_util.cc)
add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow")
if(ARROW_WITH_BROTLI)
diff --git a/cpp/src/parquet/benchmark_util.cc
b/cpp/src/parquet/benchmark_util.cc
new file mode 100644
index 0000000000..6220336e1c
--- /dev/null
+++ b/cpp/src/parquet/benchmark_util.cc
@@ -0,0 +1,126 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/benchmark_util.h"
+
+#include <random>
+
+namespace parquet::benchmark {
+
+namespace {
+
+void GenerateRandomString(uint32_t length, uint32_t seed,
std::vector<uint8_t>* heap) {
+ // Character set used to generate random string
+ const std::string charset =
+ "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+ std::default_random_engine gen(seed);
+ std::uniform_int_distribution<uint32_t> dist(0,
static_cast<int>(charset.size() - 1));
+
+ for (uint32_t i = 0; i < length; i++) {
+ heap->emplace_back(charset[dist(gen)]);
+ }
+}
+
+template <typename T>
+void GenerateBenchmarkDataIntegerImpl(uint32_t size, uint32_t seed, T* data,
+ std::vector<uint8_t>* heap, uint32_t) {
+ static_assert(std::is_integral_v<T>);
+ heap->clear();
+ std::default_random_engine gen(seed);
+ std::uniform_int_distribution<T> d(std::numeric_limits<T>::min(),
+ std::numeric_limits<T>::max());
+ for (uint32_t i = 0; i < size; ++i) {
+ data[i] = d(gen);
+ }
+}
+
+template <typename T>
+void GenerateBenchmarkDataFloatImpl(uint32_t size, uint32_t seed, T* data,
+ std::vector<uint8_t>* heap, uint32_t) {
+ static_assert(std::is_floating_point_v<T>);
+ heap->clear();
+ std::default_random_engine gen(seed);
+ std::uniform_real_distribution<T> d(std::numeric_limits<T>::lowest(),
+ std::numeric_limits<T>::max());
+ for (uint32_t i = 0; i < size; ++i) {
+ data[i] = d(gen);
+ }
+}
+
+} // namespace
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, int32_t* data,
+ std::vector<uint8_t>* heap, uint32_t
data_string_length) {
+ GenerateBenchmarkDataIntegerImpl<int32_t>(size, seed, data, heap,
data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, int64_t* data,
+ std::vector<uint8_t>* heap, uint32_t
data_string_length) {
+ GenerateBenchmarkDataIntegerImpl<int64_t>(size, seed, data, heap,
data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, float* data,
+ std::vector<uint8_t>* heap, uint32_t
data_string_length) {
+ GenerateBenchmarkDataFloatImpl<float>(size, seed, data, heap,
data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, double* data,
+ std::vector<uint8_t>* heap, uint32_t
data_string_length) {
+ GenerateBenchmarkDataFloatImpl<double>(size, seed, data, heap,
data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, Int96* data,
+ std::vector<uint8_t>* heap, uint32_t) {
+ heap->clear();
+ std::default_random_engine gen(seed);
+ std::uniform_int_distribution<int> d(std::numeric_limits<int>::min(),
+ std::numeric_limits<int>::max());
+ for (uint32_t i = 0; i < size; ++i) {
+ data[i].value[0] = d(gen);
+ data[i].value[1] = d(gen);
+ data[i].value[2] = d(gen);
+ }
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, FLBA* data,
+ std::vector<uint8_t>* heap, uint32_t
data_string_length) {
+ heap->clear();
+ GenerateRandomString(data_string_length * size, seed, heap);
+ for (uint32_t i = 0; i < size; ++i) {
+ data[i].ptr = heap->data() + i * data_string_length;
+ }
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, ByteArray* data,
+ std::vector<uint8_t>* heap, uint32_t
data_string_length) {
+ heap->clear();
+ GenerateRandomString(data_string_length * size, seed, heap);
+ for (uint32_t i = 0; i < size; ++i) {
+ data[i].ptr = heap->data() + i * data_string_length;
+ data[i].len = data_string_length;
+ }
+}
+
+} // namespace parquet::benchmark
diff --git a/cpp/src/parquet/benchmark_util.h b/cpp/src/parquet/benchmark_util.h
new file mode 100644
index 0000000000..7996f7f85e
--- /dev/null
+++ b/cpp/src/parquet/benchmark_util.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <random>
+#include <string>
+#include <vector>
+
+#include "parquet/types.h"
+
+namespace parquet::benchmark {
+
+template <typename T>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, T* data,
+ std::vector<uint8_t>* heap, uint32_t
data_string_length);
+
+#define _GENERATE_BENCHMARK_DATA_DECL(KLASS) \
+ template <> \
+ void GenerateBenchmarkData(uint32_t size, uint32_t seed, KLASS* data, \
+ std::vector<uint8_t>* heap, uint32_t
data_string_length);
+
+_GENERATE_BENCHMARK_DATA_DECL(int32_t)
+_GENERATE_BENCHMARK_DATA_DECL(int64_t)
+_GENERATE_BENCHMARK_DATA_DECL(float)
+_GENERATE_BENCHMARK_DATA_DECL(double)
+_GENERATE_BENCHMARK_DATA_DECL(ByteArray)
+_GENERATE_BENCHMARK_DATA_DECL(FLBA)
+_GENERATE_BENCHMARK_DATA_DECL(Int96)
+
+#undef _GENERATE_BENCHMARK_DATA_DECL
+
+} // namespace parquet::benchmark
diff --git a/cpp/src/parquet/bloom_filter_benchmark.cc
b/cpp/src/parquet/bloom_filter_benchmark.cc
index fa934b1d52..13c731d975 100644
--- a/cpp/src/parquet/bloom_filter_benchmark.cc
+++ b/cpp/src/parquet/bloom_filter_benchmark.cc
@@ -18,13 +18,13 @@
#include "benchmark/benchmark.h"
#include "arrow/util/logging.h"
+#include "parquet/benchmark_util.h"
#include "parquet/bloom_filter.h"
#include "parquet/properties.h"
#include <random>
-namespace parquet {
-namespace benchmark {
+namespace parquet::benchmark {
constexpr static uint32_t kNumBloomFilterInserts = 16 * 1024;
// The sample string length for FLBA and ByteArray benchmarks
@@ -40,63 +40,11 @@ std::unique_ptr<BloomFilter> CreateBloomFilter(uint32_t
num_values) {
return bloom_filter;
}
-void GenerateRandomString(uint32_t length, uint32_t seed,
std::vector<uint8_t>* heap) {
- // Character set used to generate random string
- const std::string charset =
- "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
- std::default_random_engine gen(seed);
- std::uniform_int_distribution<uint32_t> dist(0,
static_cast<int>(charset.size() - 1));
-
- for (uint32_t i = 0; i < length; i++) {
- heap->push_back(charset[dist(gen)]);
- }
-}
-
-template <typename T>
-void GenerateBenchmarkData(uint32_t size, uint32_t seed, T* data,
- [[maybe_unused]] std::vector<uint8_t>* heap =
nullptr) {
- if constexpr (std::is_integral_v<T>) {
- std::default_random_engine gen(seed);
- std::uniform_int_distribution<T> d(std::numeric_limits<T>::min(),
- std::numeric_limits<T>::max());
- for (uint32_t i = 0; i < size; ++i) {
- data[i] = d(gen);
- }
- } else if constexpr (std::is_floating_point_v<T>) {
- std::default_random_engine gen(seed);
- std::uniform_real_distribution<T> d(std::numeric_limits<T>::lowest(),
- std::numeric_limits<T>::max());
- for (uint32_t i = 0; i < size; ++i) {
- data[i] = d(gen);
- }
- } else if constexpr (std::is_same_v<FLBA, T>) {
- GenerateRandomString(kDataStringLength * size, seed, heap);
- for (uint32_t i = 0; i < size; ++i) {
- data[i].ptr = heap->data() + i * kDataStringLength;
- }
- } else if constexpr (std::is_same_v<ByteArray, T>) {
- GenerateRandomString(kDataStringLength * size, seed, heap);
- for (uint32_t i = 0; i < size; ++i) {
- data[i].ptr = heap->data() + i * kDataStringLength;
- data[i].len = kDataStringLength;
- }
- } else if constexpr (std::is_same_v<Int96, T>) {
- std::default_random_engine gen(seed);
- std::uniform_int_distribution<int> d(std::numeric_limits<int>::min(),
- std::numeric_limits<int>::max());
- for (uint32_t i = 0; i < size; ++i) {
- data[i].value[0] = d(gen);
- data[i].value[1] = d(gen);
- data[i].value[2] = d(gen);
- }
- }
-}
-
std::vector<uint64_t> GetHashValues(uint32_t num_values, uint32_t seed) {
// Generate sample data values
std::vector<int64_t> values(num_values);
- GenerateBenchmarkData(num_values, seed, values.data());
+ std::vector<uint8_t> heap;
+ GenerateBenchmarkData(num_values, seed, values.data(), &heap,
kDataStringLength);
// Create a temp filter to compute hash values
auto filter = CreateBloomFilter(/*num_values=*/8);
std::vector<uint64_t> hashes(num_values);
@@ -109,7 +57,8 @@ static void BM_ComputeHash(::benchmark::State& state) {
using T = typename DType::c_type;
std::vector<T> values(kNumBloomFilterInserts);
std::vector<uint8_t> heap;
- GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(),
&heap);
+ GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(),
&heap,
+ kDataStringLength);
auto filter = CreateBloomFilter(kNumBloomFilterInserts);
for (auto _ : state) {
uint64_t total = 0;
@@ -136,7 +85,8 @@ static void BM_BatchComputeHash(::benchmark::State& state) {
using T = typename DType::c_type;
std::vector<T> values(kNumBloomFilterInserts);
std::vector<uint8_t> heap;
- GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(),
&heap);
+ GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(),
&heap,
+ kDataStringLength);
auto filter = CreateBloomFilter(kNumBloomFilterInserts);
std::vector<uint64_t> hashes(kNumBloomFilterInserts);
for (auto _ : state) {
@@ -231,5 +181,4 @@ BENCHMARK(BM_BatchInsertHash);
BENCHMARK(BM_FindExistingHash);
BENCHMARK(BM_FindNonExistingHash);
-} // namespace benchmark
-} // namespace parquet
+} // namespace parquet::benchmark
diff --git a/cpp/src/parquet/level_conversion_benchmark.cc
b/cpp/src/parquet/level_conversion_benchmark.cc
index f9e91c4820..f3a4f8095e 100644
--- a/cpp/src/parquet/level_conversion_benchmark.cc
+++ b/cpp/src/parquet/level_conversion_benchmark.cc
@@ -29,7 +29,7 @@ constexpr int16_t kMissingDefLevel = 0;
// Definition Level indicating the values has an entry in the leaf element.
constexpr int16_t kPresentDefLevel = 2;
-// A repition level that indicates a repeated element.
+// A repetition level that indicates a repeated element.
constexpr int16_t kHasRepeatedElements = 1;
std::vector<uint8_t> RunDefinitionLevelsToBitmap(const std::vector<int16_t>&
def_levels,
diff --git a/cpp/src/parquet/page_index_benchmark.cc
b/cpp/src/parquet/page_index_benchmark.cc
new file mode 100644
index 0000000000..5631034105
--- /dev/null
+++ b/cpp/src/parquet/page_index_benchmark.cc
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+#include "parquet/benchmark_util.h"
+#include "parquet/metadata.h"
+#include "parquet/page_index.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/thrift_internal.h"
+
+namespace parquet::benchmark {
+
+void PageIndexSetArgs(::benchmark::internal::Benchmark* bench) {
+ bench->ArgNames({"num_pages"});
+ bench->Range(8, 1024);
+}
+
+void BM_ReadOffsetIndex(::benchmark::State& state) {
+ auto builder = OffsetIndexBuilder::Make();
+ const int num_pages = static_cast<int>(state.range(0));
+ constexpr int64_t page_size = 1024;
+ constexpr int64_t first_row_index = 10000;
+ for (int i = 0; i < num_pages; ++i) {
+ builder->AddPage(page_size * i, page_size, first_row_index * i);
+ }
+ constexpr int64_t final_position = 4096;
+ builder->Finish(final_position);
+ auto sink = CreateOutputStream();
+ builder->WriteTo(sink.get());
+ auto buffer = sink->Finish().ValueOrDie();
+ ReaderProperties properties;
+ for (auto _ : state) {
+ auto offset_index = OffsetIndex::Make(
+ buffer->data() + 0, static_cast<uint32_t>(buffer->size()), properties);
+ ::benchmark::DoNotOptimize(offset_index);
+ }
+ state.SetBytesProcessed(state.iterations() * buffer->size());
+ state.SetItemsProcessed(state.iterations() * num_pages);
+}
+
+BENCHMARK(BM_ReadOffsetIndex)->Apply(PageIndexSetArgs);
+
+// The sample string length for FLBA and ByteArray benchmarks
+constexpr static uint32_t kDataStringLength = 8;
+
+template <typename DType>
+void BM_ReadColumnIndex(::benchmark::State& state) {
+ schema::NodePtr type = ::parquet::schema::PrimitiveNode::Make(
+ "b", Repetition::OPTIONAL, DType::type_num, ConvertedType::NONE, 8);
+ auto descr_ptr =
+ std::make_unique<ColumnDescriptor>(type, /*def_level=*/1,
/*rep_level=*/0);
+ auto descr = descr_ptr.get();
+
+ const int num_pages = static_cast<int>(state.range(0));
+ auto builder = ColumnIndexBuilder::Make(descr);
+
+ const size_t values_per_page = 100;
+ for (int i = 0; i < num_pages; ++i) {
+ auto stats = MakeStatistics<DType>(descr);
+ std::vector<uint8_t> heap;
+ std::vector<typename DType::c_type> values;
+ values.resize(values_per_page);
+ GenerateBenchmarkData(values_per_page, /*seed=*/0, values.data(), &heap,
+ kDataStringLength);
+ stats->Update(values.data(), values_per_page, /*null_count=*/0);
+ builder->AddPage(stats->Encode());
+ }
+
+ builder->Finish();
+ auto sink = CreateOutputStream();
+ builder->WriteTo(sink.get());
+ auto buffer = sink->Finish().ValueOrDie();
+ ReaderProperties properties;
+ for (auto _ : state) {
+ auto column_index = ColumnIndex::Make(*descr, buffer->data() + 0,
+ static_cast<int>(buffer->size()),
properties);
+ ::benchmark::DoNotOptimize(column_index);
+ }
+ state.SetBytesProcessed(state.iterations() * buffer->size());
+ state.SetItemsProcessed(state.iterations() * num_pages);
+}
+
+BENCHMARK_TEMPLATE(BM_ReadColumnIndex, Int64Type)->Apply(PageIndexSetArgs);
+BENCHMARK_TEMPLATE(BM_ReadColumnIndex, DoubleType)->Apply(PageIndexSetArgs);
+BENCHMARK_TEMPLATE(BM_ReadColumnIndex, FLBAType)->Apply(PageIndexSetArgs);
+BENCHMARK_TEMPLATE(BM_ReadColumnIndex, ByteArrayType)->Apply(PageIndexSetArgs);
+
+} // namespace parquet::benchmark
diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h
index dfb4b5d0fb..b0aafa037e 100644
--- a/cpp/src/parquet/test_util.h
+++ b/cpp/src/parquet/test_util.h
@@ -556,7 +556,7 @@ static inline int MakePages(const ColumnDescriptor* d, int
num_pages, int levels
} else {
num_values = num_levels;
}
- // Create repitition levels
+ // Create repetition levels
if (max_rep_level > 0 && num_levels != 0) {
rep_levels.resize(num_levels);
// Using a different seed so that def_levels and rep_levels are different.