[arrow] branch main updated: GH-35934:[C++][Parquet] PageIndex Read benchmark (#36702)

apitrou Tue, 18 Jul 2023 09:34:22 -0700

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new f9904063b1 GH-35934:[C++][Parquet] PageIndex Read benchmark (#36702)
f9904063b1 is described below

commit f9904063b163c4ad44bef61e84a1e4a90b600d34
Author: mwish <[email protected]>
AuthorDate: Wed Jul 19 00:32:34 2023 +0800

    GH-35934:[C++][Parquet] PageIndex Read benchmark (#36702)
    
    
    
    ### Rationale for this change
    
    Add benchmark for read page index
    
    ### What changes are included in this PR?
    
    Just a benchmark in `cpp/src/parquet/bloom_filter_benchmark.cc`
    
    ### Are these changes tested?
    
    No
    
    ### Are there any user-facing changes?
    
    No
    
    * Closes: #35934
    
    Lead-authored-by: mwish <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/dataset/dataset.h               |   2 +-
 cpp/src/parquet/CMakeLists.txt                |   5 +-
 cpp/src/parquet/benchmark_util.cc             | 126 ++++++++++++++++++++++++++
 cpp/src/parquet/benchmark_util.h              |  47 ++++++++++
 cpp/src/parquet/bloom_filter_benchmark.cc     |  69 ++------------
 cpp/src/parquet/level_conversion_benchmark.cc |   2 +-
 cpp/src/parquet/page_index_benchmark.cc       | 107 ++++++++++++++++++++++
 cpp/src/parquet/test_util.h                   |   2 +-
 8 files changed, 296 insertions(+), 64 deletions(-)

diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h
index 1db230b16e..39936fbd7b 100644
--- a/cpp/src/arrow/dataset/dataset.h
+++ b/cpp/src/arrow/dataset/dataset.h
@@ -82,7 +82,7 @@ class ARROW_DS_EXPORT FragmentSelection {
 
 /// \brief Instructions for scanning a particular fragment
 ///
-/// The fragment scan request is dervied from ScanV2Options.  The main
+/// The fragment scan request is derived from ScanV2Options.  The main
 /// difference is that the scan options are based on the dataset schema
 /// while the fragment request is based on the fragment schema.
 struct ARROW_DS_EXPORT FragmentScanRequest {
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index e6aad7cee2..eb2e2d8fed 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -401,11 +401,14 @@ endif()
 add_parquet_test(file_deserialize_test SOURCES file_deserialize_test.cc 
test_util.cc)
 add_parquet_test(schema_test)
 
-add_parquet_benchmark(bloom_filter_benchmark)
+add_parquet_benchmark(bloom_filter_benchmark SOURCES bloom_filter_benchmark.cc
+                      benchmark_util.cc)
 add_parquet_benchmark(column_reader_benchmark)
 add_parquet_benchmark(column_io_benchmark)
 add_parquet_benchmark(encoding_benchmark)
 add_parquet_benchmark(level_conversion_benchmark)
+add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc
+                      benchmark_util.cc)
 add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow")
 
 if(ARROW_WITH_BROTLI)
diff --git a/cpp/src/parquet/benchmark_util.cc 
b/cpp/src/parquet/benchmark_util.cc
new file mode 100644
index 0000000000..6220336e1c
--- /dev/null
+++ b/cpp/src/parquet/benchmark_util.cc
@@ -0,0 +1,126 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/benchmark_util.h"
+
+#include <random>
+
+namespace parquet::benchmark {
+
+namespace {
+
+void GenerateRandomString(uint32_t length, uint32_t seed, 
std::vector<uint8_t>* heap) {
+  // Character set used to generate random string
+  const std::string charset =
+      "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+  std::default_random_engine gen(seed);
+  std::uniform_int_distribution<uint32_t> dist(0, 
static_cast<int>(charset.size() - 1));
+
+  for (uint32_t i = 0; i < length; i++) {
+    heap->emplace_back(charset[dist(gen)]);
+  }
+}
+
+template <typename T>
+void GenerateBenchmarkDataIntegerImpl(uint32_t size, uint32_t seed, T* data,
+                                      std::vector<uint8_t>* heap, uint32_t) {
+  static_assert(std::is_integral_v<T>);
+  heap->clear();
+  std::default_random_engine gen(seed);
+  std::uniform_int_distribution<T> d(std::numeric_limits<T>::min(),
+                                     std::numeric_limits<T>::max());
+  for (uint32_t i = 0; i < size; ++i) {
+    data[i] = d(gen);
+  }
+}
+
+template <typename T>
+void GenerateBenchmarkDataFloatImpl(uint32_t size, uint32_t seed, T* data,
+                                    std::vector<uint8_t>* heap, uint32_t) {
+  static_assert(std::is_floating_point_v<T>);
+  heap->clear();
+  std::default_random_engine gen(seed);
+  std::uniform_real_distribution<T> d(std::numeric_limits<T>::lowest(),
+                                      std::numeric_limits<T>::max());
+  for (uint32_t i = 0; i < size; ++i) {
+    data[i] = d(gen);
+  }
+}
+
+}  // namespace
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, int32_t* data,
+                           std::vector<uint8_t>* heap, uint32_t 
data_string_length) {
+  GenerateBenchmarkDataIntegerImpl<int32_t>(size, seed, data, heap, 
data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, int64_t* data,
+                           std::vector<uint8_t>* heap, uint32_t 
data_string_length) {
+  GenerateBenchmarkDataIntegerImpl<int64_t>(size, seed, data, heap, 
data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, float* data,
+                           std::vector<uint8_t>* heap, uint32_t 
data_string_length) {
+  GenerateBenchmarkDataFloatImpl<float>(size, seed, data, heap, 
data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, double* data,
+                           std::vector<uint8_t>* heap, uint32_t 
data_string_length) {
+  GenerateBenchmarkDataFloatImpl<double>(size, seed, data, heap, 
data_string_length);
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, Int96* data,
+                           std::vector<uint8_t>* heap, uint32_t) {
+  heap->clear();
+  std::default_random_engine gen(seed);
+  std::uniform_int_distribution<int> d(std::numeric_limits<int>::min(),
+                                       std::numeric_limits<int>::max());
+  for (uint32_t i = 0; i < size; ++i) {
+    data[i].value[0] = d(gen);
+    data[i].value[1] = d(gen);
+    data[i].value[2] = d(gen);
+  }
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, FLBA* data,
+                           std::vector<uint8_t>* heap, uint32_t 
data_string_length) {
+  heap->clear();
+  GenerateRandomString(data_string_length * size, seed, heap);
+  for (uint32_t i = 0; i < size; ++i) {
+    data[i].ptr = heap->data() + i * data_string_length;
+  }
+}
+
+template <>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, ByteArray* data,
+                           std::vector<uint8_t>* heap, uint32_t 
data_string_length) {
+  heap->clear();
+  GenerateRandomString(data_string_length * size, seed, heap);
+  for (uint32_t i = 0; i < size; ++i) {
+    data[i].ptr = heap->data() + i * data_string_length;
+    data[i].len = data_string_length;
+  }
+}
+
+}  // namespace parquet::benchmark
diff --git a/cpp/src/parquet/benchmark_util.h b/cpp/src/parquet/benchmark_util.h
new file mode 100644
index 0000000000..7996f7f85e
--- /dev/null
+++ b/cpp/src/parquet/benchmark_util.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <random>
+#include <string>
+#include <vector>
+
+#include "parquet/types.h"
+
+namespace parquet::benchmark {
+
+template <typename T>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, T* data,
+                           std::vector<uint8_t>* heap, uint32_t 
data_string_length);
+
+#define _GENERATE_BENCHMARK_DATA_DECL(KLASS)                            \
+  template <>                                                           \
+  void GenerateBenchmarkData(uint32_t size, uint32_t seed, KLASS* data, \
+                             std::vector<uint8_t>* heap, uint32_t 
data_string_length);
+
+_GENERATE_BENCHMARK_DATA_DECL(int32_t)
+_GENERATE_BENCHMARK_DATA_DECL(int64_t)
+_GENERATE_BENCHMARK_DATA_DECL(float)
+_GENERATE_BENCHMARK_DATA_DECL(double)
+_GENERATE_BENCHMARK_DATA_DECL(ByteArray)
+_GENERATE_BENCHMARK_DATA_DECL(FLBA)
+_GENERATE_BENCHMARK_DATA_DECL(Int96)
+
+#undef _GENERATE_BENCHMARK_DATA_DECL
+
+}  // namespace parquet::benchmark
diff --git a/cpp/src/parquet/bloom_filter_benchmark.cc 
b/cpp/src/parquet/bloom_filter_benchmark.cc
index fa934b1d52..13c731d975 100644
--- a/cpp/src/parquet/bloom_filter_benchmark.cc
+++ b/cpp/src/parquet/bloom_filter_benchmark.cc
@@ -18,13 +18,13 @@
 #include "benchmark/benchmark.h"
 
 #include "arrow/util/logging.h"
+#include "parquet/benchmark_util.h"
 #include "parquet/bloom_filter.h"
 #include "parquet/properties.h"
 
 #include <random>
 
-namespace parquet {
-namespace benchmark {
+namespace parquet::benchmark {
 
 constexpr static uint32_t kNumBloomFilterInserts = 16 * 1024;
 // The sample string length for FLBA and ByteArray benchmarks
@@ -40,63 +40,11 @@ std::unique_ptr<BloomFilter> CreateBloomFilter(uint32_t 
num_values) {
   return bloom_filter;
 }
 
-void GenerateRandomString(uint32_t length, uint32_t seed, 
std::vector<uint8_t>* heap) {
-  // Character set used to generate random string
-  const std::string charset =
-      "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-  std::default_random_engine gen(seed);
-  std::uniform_int_distribution<uint32_t> dist(0, 
static_cast<int>(charset.size() - 1));
-
-  for (uint32_t i = 0; i < length; i++) {
-    heap->push_back(charset[dist(gen)]);
-  }
-}
-
-template <typename T>
-void GenerateBenchmarkData(uint32_t size, uint32_t seed, T* data,
-                           [[maybe_unused]] std::vector<uint8_t>* heap = 
nullptr) {
-  if constexpr (std::is_integral_v<T>) {
-    std::default_random_engine gen(seed);
-    std::uniform_int_distribution<T> d(std::numeric_limits<T>::min(),
-                                       std::numeric_limits<T>::max());
-    for (uint32_t i = 0; i < size; ++i) {
-      data[i] = d(gen);
-    }
-  } else if constexpr (std::is_floating_point_v<T>) {
-    std::default_random_engine gen(seed);
-    std::uniform_real_distribution<T> d(std::numeric_limits<T>::lowest(),
-                                        std::numeric_limits<T>::max());
-    for (uint32_t i = 0; i < size; ++i) {
-      data[i] = d(gen);
-    }
-  } else if constexpr (std::is_same_v<FLBA, T>) {
-    GenerateRandomString(kDataStringLength * size, seed, heap);
-    for (uint32_t i = 0; i < size; ++i) {
-      data[i].ptr = heap->data() + i * kDataStringLength;
-    }
-  } else if constexpr (std::is_same_v<ByteArray, T>) {
-    GenerateRandomString(kDataStringLength * size, seed, heap);
-    for (uint32_t i = 0; i < size; ++i) {
-      data[i].ptr = heap->data() + i * kDataStringLength;
-      data[i].len = kDataStringLength;
-    }
-  } else if constexpr (std::is_same_v<Int96, T>) {
-    std::default_random_engine gen(seed);
-    std::uniform_int_distribution<int> d(std::numeric_limits<int>::min(),
-                                         std::numeric_limits<int>::max());
-    for (uint32_t i = 0; i < size; ++i) {
-      data[i].value[0] = d(gen);
-      data[i].value[1] = d(gen);
-      data[i].value[2] = d(gen);
-    }
-  }
-}
-
 std::vector<uint64_t> GetHashValues(uint32_t num_values, uint32_t seed) {
   // Generate sample data values
   std::vector<int64_t> values(num_values);
-  GenerateBenchmarkData(num_values, seed, values.data());
+  std::vector<uint8_t> heap;
+  GenerateBenchmarkData(num_values, seed, values.data(), &heap, 
kDataStringLength);
   // Create a temp filter to compute hash values
   auto filter = CreateBloomFilter(/*num_values=*/8);
   std::vector<uint64_t> hashes(num_values);
@@ -109,7 +57,8 @@ static void BM_ComputeHash(::benchmark::State& state) {
   using T = typename DType::c_type;
   std::vector<T> values(kNumBloomFilterInserts);
   std::vector<uint8_t> heap;
-  GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), 
&heap);
+  GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), 
&heap,
+                        kDataStringLength);
   auto filter = CreateBloomFilter(kNumBloomFilterInserts);
   for (auto _ : state) {
     uint64_t total = 0;
@@ -136,7 +85,8 @@ static void BM_BatchComputeHash(::benchmark::State& state) {
   using T = typename DType::c_type;
   std::vector<T> values(kNumBloomFilterInserts);
   std::vector<uint8_t> heap;
-  GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), 
&heap);
+  GenerateBenchmarkData(kNumBloomFilterInserts, /*seed=*/0, values.data(), 
&heap,
+                        kDataStringLength);
   auto filter = CreateBloomFilter(kNumBloomFilterInserts);
   std::vector<uint64_t> hashes(kNumBloomFilterInserts);
   for (auto _ : state) {
@@ -231,5 +181,4 @@ BENCHMARK(BM_BatchInsertHash);
 BENCHMARK(BM_FindExistingHash);
 BENCHMARK(BM_FindNonExistingHash);
 
-}  // namespace benchmark
-}  // namespace parquet
+}  // namespace parquet::benchmark
diff --git a/cpp/src/parquet/level_conversion_benchmark.cc 
b/cpp/src/parquet/level_conversion_benchmark.cc
index f9e91c4820..f3a4f8095e 100644
--- a/cpp/src/parquet/level_conversion_benchmark.cc
+++ b/cpp/src/parquet/level_conversion_benchmark.cc
@@ -29,7 +29,7 @@ constexpr int16_t kMissingDefLevel = 0;
 // Definition Level indicating the values has an entry in the leaf element.
 constexpr int16_t kPresentDefLevel = 2;
 
-// A repition level that indicates a repeated element.
+// A repetition level that indicates a repeated element.
 constexpr int16_t kHasRepeatedElements = 1;
 
 std::vector<uint8_t> RunDefinitionLevelsToBitmap(const std::vector<int16_t>& 
def_levels,
diff --git a/cpp/src/parquet/page_index_benchmark.cc 
b/cpp/src/parquet/page_index_benchmark.cc
new file mode 100644
index 0000000000..5631034105
--- /dev/null
+++ b/cpp/src/parquet/page_index_benchmark.cc
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+#include "parquet/benchmark_util.h"
+#include "parquet/metadata.h"
+#include "parquet/page_index.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/thrift_internal.h"
+
+namespace parquet::benchmark {
+
+void PageIndexSetArgs(::benchmark::internal::Benchmark* bench) {
+  bench->ArgNames({"num_pages"});
+  bench->Range(8, 1024);
+}
+
+void BM_ReadOffsetIndex(::benchmark::State& state) {
+  auto builder = OffsetIndexBuilder::Make();
+  const int num_pages = static_cast<int>(state.range(0));
+  constexpr int64_t page_size = 1024;
+  constexpr int64_t first_row_index = 10000;
+  for (int i = 0; i < num_pages; ++i) {
+    builder->AddPage(page_size * i, page_size, first_row_index * i);
+  }
+  constexpr int64_t final_position = 4096;
+  builder->Finish(final_position);
+  auto sink = CreateOutputStream();
+  builder->WriteTo(sink.get());
+  auto buffer = sink->Finish().ValueOrDie();
+  ReaderProperties properties;
+  for (auto _ : state) {
+    auto offset_index = OffsetIndex::Make(
+        buffer->data() + 0, static_cast<uint32_t>(buffer->size()), properties);
+    ::benchmark::DoNotOptimize(offset_index);
+  }
+  state.SetBytesProcessed(state.iterations() * buffer->size());
+  state.SetItemsProcessed(state.iterations() * num_pages);
+}
+
+BENCHMARK(BM_ReadOffsetIndex)->Apply(PageIndexSetArgs);
+
+// The sample string length for FLBA and ByteArray benchmarks
+constexpr static uint32_t kDataStringLength = 8;
+
+template <typename DType>
+void BM_ReadColumnIndex(::benchmark::State& state) {
+  schema::NodePtr type = ::parquet::schema::PrimitiveNode::Make(
+      "b", Repetition::OPTIONAL, DType::type_num, ConvertedType::NONE, 8);
+  auto descr_ptr =
+      std::make_unique<ColumnDescriptor>(type, /*def_level=*/1, 
/*rep_level=*/0);
+  auto descr = descr_ptr.get();
+
+  const int num_pages = static_cast<int>(state.range(0));
+  auto builder = ColumnIndexBuilder::Make(descr);
+
+  const size_t values_per_page = 100;
+  for (int i = 0; i < num_pages; ++i) {
+    auto stats = MakeStatistics<DType>(descr);
+    std::vector<uint8_t> heap;
+    std::vector<typename DType::c_type> values;
+    values.resize(values_per_page);
+    GenerateBenchmarkData(values_per_page, /*seed=*/0, values.data(), &heap,
+                          kDataStringLength);
+    stats->Update(values.data(), values_per_page, /*null_count=*/0);
+    builder->AddPage(stats->Encode());
+  }
+
+  builder->Finish();
+  auto sink = CreateOutputStream();
+  builder->WriteTo(sink.get());
+  auto buffer = sink->Finish().ValueOrDie();
+  ReaderProperties properties;
+  for (auto _ : state) {
+    auto column_index = ColumnIndex::Make(*descr, buffer->data() + 0,
+                                          static_cast<int>(buffer->size()), 
properties);
+    ::benchmark::DoNotOptimize(column_index);
+  }
+  state.SetBytesProcessed(state.iterations() * buffer->size());
+  state.SetItemsProcessed(state.iterations() * num_pages);
+}
+
+BENCHMARK_TEMPLATE(BM_ReadColumnIndex, Int64Type)->Apply(PageIndexSetArgs);
+BENCHMARK_TEMPLATE(BM_ReadColumnIndex, DoubleType)->Apply(PageIndexSetArgs);
+BENCHMARK_TEMPLATE(BM_ReadColumnIndex, FLBAType)->Apply(PageIndexSetArgs);
+BENCHMARK_TEMPLATE(BM_ReadColumnIndex, ByteArrayType)->Apply(PageIndexSetArgs);
+
+}  // namespace parquet::benchmark
diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h
index dfb4b5d0fb..b0aafa037e 100644
--- a/cpp/src/parquet/test_util.h
+++ b/cpp/src/parquet/test_util.h
@@ -556,7 +556,7 @@ static inline int MakePages(const ColumnDescriptor* d, int 
num_pages, int levels
   } else {
     num_values = num_levels;
   }
-  // Create repitition levels
+  // Create repetition levels
   if (max_rep_level > 0 && num_levels != 0) {
     rep_levels.resize(num_levels);
     // Using a different seed so that def_levels and rep_levels are different.

[arrow] branch main updated: GH-35934:[C++][Parquet] PageIndex Read benchmark (#36702)

Reply via email to