fatemehp commented on code in PR #17877:
URL: https://github.com/apache/arrow/pull/17877#discussion_r1119358803
##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -123,35 +142,82 @@ static void ColumnReaderReadBatchInt32(::benchmark::State& state) {
state.SetBytesProcessed(state.iterations() * helper.total_size());
}
+// Benchmarks ReadRecords for RecordReader with the following parameters in order:
+// - repetition: 0 for REQUIRED, 1 for OPTIONAL, 2 for REPEATED.
+// - batch_size: sets how many values to read at each call.
+// - read_dense_for_nullable: sets reading dense or spaced.
+static void RecordReaderReadRecords(::benchmark::State& state) {
+ const auto repetition = static_cast<Repetition::type>(state.range(0));
+ const auto batch_size = static_cast<int64_t>(state.range(1));
+ const bool read_dense_for_nullable = state.range(2);
+
+ BenchmarkHelper helper(repetition, /*num_pages=*/16, /*levels_per_page=*/80000);
+
+ // Vectors to read the values into.
+ for (auto _ : state) {
+ state.PauseTiming();
+ RecordReader* reader = helper.ResetRecordReader(read_dense_for_nullable);
+ int64_t records_read = -1;
+ state.ResumeTiming();
+ while (records_read != 0) {
+ DoNotOptimize(records_read = reader->ReadRecords(batch_size));
+ reader->Reset();
+ }
+ }
+
+ state.SetBytesProcessed(state.iterations() * helper.total_size());
+}
+
+// Benchmarks SkipRecords for RecordReader with the following parameters in order:
+// - repetition: 0 for REQUIRED, 1 for OPTIONAL, 2 for REPEATED.
+// - batch_size: sets how many values to read at each call.
+static void RecordReaderSkipRecords(::benchmark::State& state) {
+ const auto repetition = static_cast<Repetition::type>(state.range(0));
+ const auto batch_size = static_cast<int64_t>(state.range(1));
+
+ BenchmarkHelper helper(repetition, /*num_pages=*/16, /*levels_per_page=*/80000);
+
+ // Vectors to read the values into.
+ for (auto _ : state) {
+ state.PauseTiming();
+ RecordReader* reader = helper.ResetRecordReader(/*read_dense_for_nullable=*/true);
+ int64_t records_skipped = -1;
+ state.ResumeTiming();
+ while (records_skipped != 0) {
+ DoNotOptimize(records_skipped = reader->SkipRecords(batch_size));
+ reader->Reset();
+ }
+ }
+
+ state.SetBytesProcessed(state.iterations() * helper.total_size());
+}
+
BENCHMARK(ColumnReaderSkipInt32)
->ArgNames({"Repetition", "BatchSize"})
- ->Args({0, 100})
->Args({0, 1000})
- ->Args({0, 10000})
- ->Args({0, 100000})
- ->Args({1, 100})
->Args({1, 1000})
- ->Args({1, 10000})
- ->Args({1, 100000})
- ->Args({2, 100})
- ->Args({2, 1000})
- ->Args({2, 10000})
- ->Args({2, 100000});
+ ->Args({2, 1000});
BENCHMARK(ColumnReaderReadBatchInt32)
->ArgNames({"Repetition", "BatchSize"})
- ->Args({0, 100})
->Args({0, 1000})
- ->Args({0, 10000})
- ->Args({0, 100000})
- ->Args({1, 100})
->Args({1, 1000})
- ->Args({1, 10000})
- ->Args({1, 100000})
- ->Args({2, 100})
- ->Args({2, 1000})
- ->Args({2, 10000})
- ->Args({2, 100000});
+ ->Args({2, 1000});
+
+BENCHMARK(RecordReaderSkipRecords)
Review Comment:
This is just to keep things less cluttered. The previous benchmark showed
that a batch size of ~1000 gives reasonable performance, so I am using that
going forward. One can always benchmark with different batch sizes.
##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -123,35 +142,82 @@ static void ColumnReaderReadBatchInt32(::benchmark::State& state) {
state.SetBytesProcessed(state.iterations() * helper.total_size());
}
+// Benchmarks ReadRecords for RecordReader with the following parameters in order:
+// - repetition: 0 for REQUIRED, 1 for OPTIONAL, 2 for REPEATED.
+// - batch_size: sets how many values to read at each call.
+// - read_dense_for_nullable: sets reading dense or spaced.
+static void RecordReaderReadRecords(::benchmark::State& state) {
+ const auto repetition = static_cast<Repetition::type>(state.range(0));
+ const auto batch_size = static_cast<int64_t>(state.range(1));
+ const bool read_dense_for_nullable = state.range(2);
+
+ BenchmarkHelper helper(repetition, /*num_pages=*/16, /*levels_per_page=*/80000);
+
+ // Vectors to read the values into.
+ for (auto _ : state) {
+ state.PauseTiming();
+ RecordReader* reader = helper.ResetRecordReader(read_dense_for_nullable);
+ int64_t records_read = -1;
+ state.ResumeTiming();
+ while (records_read != 0) {
+ DoNotOptimize(records_read = reader->ReadRecords(batch_size));
+ reader->Reset();
+ }
+ }
+
+ state.SetBytesProcessed(state.iterations() * helper.total_size());
+}
+
+// Benchmarks SkipRecords for RecordReader with the following parameters in order:
+// - repetition: 0 for REQUIRED, 1 for OPTIONAL, 2 for REPEATED.
+// - batch_size: sets how many values to read at each call.
+static void RecordReaderSkipRecords(::benchmark::State& state) {
+ const auto repetition = static_cast<Repetition::type>(state.range(0));
+ const auto batch_size = static_cast<int64_t>(state.range(1));
+
+ BenchmarkHelper helper(repetition, /*num_pages=*/16, /*levels_per_page=*/80000);
+
+ // Vectors to read the values into.
+ for (auto _ : state) {
+ state.PauseTiming();
+ RecordReader* reader = helper.ResetRecordReader(/*read_dense_for_nullable=*/true);
Review Comment:
Set to false.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]