This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new df83e50cdb GH-39667: [C++] Ensure dataset benchmarks present a bytes/s 
or items/s metric (#39766)
df83e50cdb is described below

commit df83e50cdbc956846476a1dbcd5f09ef7058ed58
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jan 24 11:22:51 2024 +0100

    GH-39667: [C++] Ensure dataset benchmarks present a bytes/s or items/s 
metric (#39766)
    
    ### Rationale for this change
    
    Some of our microbenchmarks only present an iteration time in
    (nano-, micro-, ...)seconds, which is usually tedious to read and
    difficult to interpret.
    
    ### What changes are included in this PR?
    
    Ensure that benchmarks present an items/second and/or a bytes/second
    metric where that makes sense.
    
    ### Are these changes tested?
    
    Manually.
    
    ### Are there any user-facing changes?
    
    No.
    * Closes: #39667
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/dataset/file_benchmark.cc | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_benchmark.cc 
b/cpp/src/arrow/dataset/file_benchmark.cc
index 8953cbd110..f687392d13 100644
--- a/cpp/src/arrow/dataset/file_benchmark.cc
+++ b/cpp/src/arrow/dataset/file_benchmark.cc
@@ -30,7 +30,12 @@
 namespace arrow {
 namespace dataset {
 
-static std::shared_ptr<Dataset> GetDataset() {
+struct SampleDataset {
+  std::shared_ptr<Dataset> dataset;
+  int64_t num_fragments;
+};
+
+static SampleDataset GetDataset() {
   std::vector<fs::FileInfo> files;
   std::vector<std::string> paths;
   for (int a = 0; a < 100; a++) {
@@ -50,25 +55,35 @@ static std::shared_ptr<Dataset> GetDataset() {
   FinishOptions finish_options;
   finish_options.inspect_options.fragments = 0;
   EXPECT_OK_AND_ASSIGN(auto dataset, factory->Finish(finish_options));
-  return dataset;
+  return {dataset, static_cast<int64_t>(paths.size())};
 }
 
 // A benchmark of filtering fragments in a dataset.
 static void GetAllFragments(benchmark::State& state) {
   auto dataset = GetDataset();
   for (auto _ : state) {
-    ASSERT_OK_AND_ASSIGN(auto fragments, dataset->GetFragments());
+    ASSERT_OK_AND_ASSIGN(auto fragments, dataset.dataset->GetFragments());
     ABORT_NOT_OK(fragments.Visit([](std::shared_ptr<Fragment>) { return 
Status::OK(); }));
   }
+  state.SetItemsProcessed(state.iterations() * dataset.num_fragments);
+  state.counters["num_fragments"] = dataset.num_fragments;
 }
 
 static void GetFilteredFragments(benchmark::State& state, compute::Expression 
filter) {
   auto dataset = GetDataset();
-  ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*dataset->schema()));
+  ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*dataset.dataset->schema()));
+  int64_t num_filtered_fragments = 0;
   for (auto _ : state) {
-    ASSERT_OK_AND_ASSIGN(auto fragments, dataset->GetFragments(filter));
-    ABORT_NOT_OK(fragments.Visit([](std::shared_ptr<Fragment>) { return 
Status::OK(); }));
+    num_filtered_fragments = 0;
+    ASSERT_OK_AND_ASSIGN(auto fragments, 
dataset.dataset->GetFragments(filter));
+    ABORT_NOT_OK(fragments.Visit([&](std::shared_ptr<Fragment>) {
+      ++num_filtered_fragments;
+      return Status::OK();
+    }));
   }
+  state.SetItemsProcessed(state.iterations() * dataset.num_fragments);
+  state.counters["num_fragments"] = dataset.num_fragments;
+  state.counters["num_filtered_fragments"] = num_filtered_fragments;
 }
 
 using compute::field_ref;

Reply via email to