westonpace commented on code in PR #40884:
URL: https://github.com/apache/arrow/pull/40884#discussion_r1547750014


##########
cpp/src/arrow/acero/aggregate_benchmark.cc:
##########
@@ -321,6 +321,19 @@ BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll<int64_t>)
 // GroupBy
 //
 
+int64_t CalculateBatchSize(const std::shared_ptr<RecordBatch>& batch) {

Review Comment:
   There is a method `TotalBufferSize` in `src/arrow/util/byte_size.h` that you can use.
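   For example, a minimal sketch of what the helper could reduce to, assuming the benchmark keeps a `CalculateBatchSize` wrapper at all (the `RecordBatch` overload of `TotalBufferSize` lives in `arrow::util`):

```cpp
#include "arrow/util/byte_size.h"

// Sketch only: delegate to the existing utility, which sums the sizes of the
// buffers referenced by the batch instead of walking the arrays by hand.
int64_t CalculateBatchSize(const std::shared_ptr<RecordBatch>& batch) {
  return ::arrow::util::TotalBufferSize(*batch);
}
```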



##########
cpp/src/arrow/acero/expression_benchmark.cc:
##########
@@ -41,6 +41,9 @@ std::shared_ptr<Scalar> ninety_nine_dict =
     DictionaryScalar::Make(MakeScalar(0), ArrayFromJSON(int64(), "[99]"));
 
 static void BindAndEvaluate(benchmark::State& state, Expression expr) {
+  const auto rows_per_batch = static_cast<int32_t>(state.range(0));
+  const auto num_batches = 1000000 / rows_per_batch;

Review Comment:
   Where is 1000000 coming from? It looks like each iteration is running against a batch of 4 rows.
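   If a throughput counter is wanted, one hedged alternative (my sketch, not the PR's code) would be to derive it from the batch the loop actually evaluates, assuming `input` names that `ExecBatch` and `length` is its row count:

```cpp
// Sketch only: count the rows the measured loop really processes instead of a
// hard-coded 1000000 total that may not match the 4-row input batch.
state.counters["rows_per_second"] = benchmark::Counter(
    static_cast<double>(state.iterations() * input.length),
    benchmark::Counter::kIsRate);
```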



##########
cpp/src/arrow/acero/expression_benchmark.cc:
##########
@@ -66,17 +69,34 @@ static void BindAndEvaluate(benchmark::State& state, Expression expr) {
     ASSIGN_OR_ABORT(auto bound, expr.Bind(*dataset_schema));
     ABORT_NOT_OK(ExecuteScalarExpression(bound, input, &ctx).status());
   }
+
+  state.counters["rows_per_second"] = benchmark::Counter(
+      static_cast<double>(state.iterations() * num_batches * rows_per_batch),
+      benchmark::Counter::kIsRate);
+
+  state.counters["batches_per_second"] = benchmark::Counter(
+      static_cast<double>(state.iterations() * num_batches), benchmark::Counter::kIsRate);
 }
 
 // A benchmark of SimplifyWithGuarantee using expressions arising from partitioning.
 static void SimplifyFilterWithGuarantee(benchmark::State& state, Expression filter,
                                         Expression guarantee) {
+  const auto rows_per_batch = static_cast<int32_t>(state.range(0));
+  const auto num_batches = 1000000 / rows_per_batch;
+
   auto dataset_schema = schema({field("a", int64()), field("b", int64())});
   ASSIGN_OR_ABORT(filter, filter.Bind(*dataset_schema));
 
   for (auto _ : state) {
     ABORT_NOT_OK(SimplifyWithGuarantee(filter, guarantee));
   }
+
+  state.counters["rows_per_second"] = benchmark::Counter(
+      static_cast<double>(state.iterations() * num_batches * rows_per_batch),
+      benchmark::Counter::kIsRate);
+
+  state.counters["batches_per_second"] = benchmark::Counter(
+      static_cast<double>(state.iterations() * num_batches), benchmark::Counter::kIsRate);

Review Comment:
   This benchmark doesn't have any input; it is simplifying an expression. I'm not sure a rows-per-second or batches-per-second counter makes sense here.
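   If any throughput number is kept for this benchmark, a minimal sketch of what I'd find more meaningful (my suggestion, not the PR's code) is to report one item per simplification and let Google Benchmark derive items per second:

```cpp
// Sketch only: each loop iteration performs exactly one simplification, so a
// plain items-processed count is the only rate that seems well defined here.
static void SimplifyFilterWithGuarantee(benchmark::State& state, Expression filter,
                                        Expression guarantee) {
  auto dataset_schema = schema({field("a", int64()), field("b", int64())});
  ASSIGN_OR_ABORT(filter, filter.Bind(*dataset_schema));

  for (auto _ : state) {
    ABORT_NOT_OK(SimplifyWithGuarantee(filter, guarantee));
  }
  state.SetItemsProcessed(state.iterations());
}
```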



##########
cpp/src/arrow/acero/expression_benchmark.cc:
##########
@@ -163,31 +183,103 @@ auto ref_only_expression = field_ref("x");
 
 // Negative queries (partition expressions that fail the filter)
 BENCHMARK_CAPTURE(SimplifyFilterWithGuarantee, negative_filter_simple_guarantee_simple,
-                  filter_simple_negative, guarantee);
+                  filter_simple_negative, guarantee)
+    ->ArgNames({"rows_per_batch"})
+    ->RangeMultiplier(10)
+    ->Range(1000, 1000000)

Review Comment:
   I'm not sure it makes sense to add `Range` / `RangeMultiplier` here: since the benchmark isn't actually looking at the argument, it will have no effect, right?
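   A standalone illustration of what I mean (generic Google Benchmark code, not from this PR): `Range`/`RangeMultiplier` register one instance per argument value, but the instances only measure different work if the loop reads `state.range(0)`:

```cpp
#include <benchmark/benchmark.h>

// The argument shapes the measured work only because the loop reads it.
static void UsesArg(benchmark::State& state) {
  const int64_t n = state.range(0);
  for (auto _ : state) {
    int64_t sum = 0;
    for (int64_t i = 0; i < n; ++i) sum += i;
    benchmark::DoNotOptimize(sum);
  }
}
BENCHMARK(UsesArg)->RangeMultiplier(10)->Range(1000, 1000000);

// Here the argument is never read, so all four registered instances
// (1000, 10000, 100000, 1000000) time exactly the same body.
static void IgnoresArg(benchmark::State& state) {
  for (auto _ : state) {
    int x = 42;
    benchmark::DoNotOptimize(x);
  }
}
BENCHMARK(IgnoresArg)->RangeMultiplier(10)->Range(1000, 1000000);

BENCHMARK_MAIN();
```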


