This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new df83e50cdb GH-39667: [C++] Ensure dataset benchmarks present a bytes/s
or items/s metric (#39766)
df83e50cdb is described below
commit df83e50cdbc956846476a1dbcd5f09ef7058ed58
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jan 24 11:22:51 2024 +0100
GH-39667: [C++] Ensure dataset benchmarks present a bytes/s or items/s
metric (#39766)
### Rationale for this change
Some of our microbenchmarks only present an iteration time in
(nano-, micro-, ...)seconds. That is usually tedious to read and difficult to
interpret.
### What changes are included in this PR?
Ensure that benchmarks present an items/second and/or a bytes/second
metric where that makes sense.
### Are these changes tested?
Manually.
### Are there any user-facing changes?
No.
* Closes: #39667
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/dataset/file_benchmark.cc | 27 +++++++++++++++++++++------
1 file changed, 21 insertions(+), 6 deletions(-)
diff --git a/cpp/src/arrow/dataset/file_benchmark.cc
b/cpp/src/arrow/dataset/file_benchmark.cc
index 8953cbd110..f687392d13 100644
--- a/cpp/src/arrow/dataset/file_benchmark.cc
+++ b/cpp/src/arrow/dataset/file_benchmark.cc
@@ -30,7 +30,12 @@
namespace arrow {
namespace dataset {
-static std::shared_ptr<Dataset> GetDataset() {
+struct SampleDataset {
+ std::shared_ptr<Dataset> dataset;
+ int64_t num_fragments;
+};
+
+static SampleDataset GetDataset() {
std::vector<fs::FileInfo> files;
std::vector<std::string> paths;
for (int a = 0; a < 100; a++) {
@@ -50,25 +55,35 @@ static std::shared_ptr<Dataset> GetDataset() {
FinishOptions finish_options;
finish_options.inspect_options.fragments = 0;
EXPECT_OK_AND_ASSIGN(auto dataset, factory->Finish(finish_options));
- return dataset;
+ return {dataset, static_cast<int64_t>(paths.size())};
}
// A benchmark of filtering fragments in a dataset.
static void GetAllFragments(benchmark::State& state) {
auto dataset = GetDataset();
for (auto _ : state) {
- ASSERT_OK_AND_ASSIGN(auto fragments, dataset->GetFragments());
+ ASSERT_OK_AND_ASSIGN(auto fragments, dataset.dataset->GetFragments());
ABORT_NOT_OK(fragments.Visit([](std::shared_ptr<Fragment>) { return
Status::OK(); }));
}
+ state.SetItemsProcessed(state.iterations() * dataset.num_fragments);
+ state.counters["num_fragments"] = dataset.num_fragments;
}
static void GetFilteredFragments(benchmark::State& state, compute::Expression
filter) {
auto dataset = GetDataset();
- ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*dataset->schema()));
+ ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*dataset.dataset->schema()));
+ int64_t num_filtered_fragments = 0;
for (auto _ : state) {
- ASSERT_OK_AND_ASSIGN(auto fragments, dataset->GetFragments(filter));
- ABORT_NOT_OK(fragments.Visit([](std::shared_ptr<Fragment>) { return
Status::OK(); }));
+ num_filtered_fragments = 0;
+ ASSERT_OK_AND_ASSIGN(auto fragments,
dataset.dataset->GetFragments(filter));
+ ABORT_NOT_OK(fragments.Visit([&](std::shared_ptr<Fragment>) {
+ ++num_filtered_fragments;
+ return Status::OK();
+ }));
}
+ state.SetItemsProcessed(state.iterations() * dataset.num_fragments);
+ state.counters["num_fragments"] = dataset.num_fragments;
+ state.counters["num_filtered_fragments"] = num_filtered_fragments;
}
using compute::field_ref;