westonpace commented on code in PR #40884:
URL: https://github.com/apache/arrow/pull/40884#discussion_r1547750014
##########
cpp/src/arrow/acero/aggregate_benchmark.cc:
##########
@@ -321,6 +321,19 @@ BENCHMARK_TEMPLATE(ReferenceSum,
SumBitmapVectorizeUnroll<int64_t>)
// GroupBy
//
+int64_t CalculateBatchSize(const std::shared_ptr<RecordBatch>& batch) {
Review Comment:
There is a method `TotalBufferSize` in `src/arrow/util/byte_size.h` that you
can use.
##########
cpp/src/arrow/acero/expression_benchmark.cc:
##########
@@ -41,6 +41,9 @@ std::shared_ptr<Scalar> ninety_nine_dict =
DictionaryScalar::Make(MakeScalar(0), ArrayFromJSON(int64(), "[99]"));
static void BindAndEvaluate(benchmark::State& state, Expression expr) {
+ const auto rows_per_batch = static_cast<int32_t>(state.range(0));
+ const auto num_batches = 1000000 / rows_per_batch;
Review Comment:
Where is 1000000 coming from? It looks like each iteration is running
against a batch of 4 rows.
##########
cpp/src/arrow/acero/expression_benchmark.cc:
##########
@@ -66,17 +69,34 @@ static void BindAndEvaluate(benchmark::State& state,
Expression expr) {
ASSIGN_OR_ABORT(auto bound, expr.Bind(*dataset_schema));
ABORT_NOT_OK(ExecuteScalarExpression(bound, input, &ctx).status());
}
+
+ state.counters["rows_per_second"] = benchmark::Counter(
+ static_cast<double>(state.iterations() * num_batches * rows_per_batch),
+ benchmark::Counter::kIsRate);
+
+ state.counters["batches_per_second"] = benchmark::Counter(
+ static_cast<double>(state.iterations() * num_batches),
benchmark::Counter::kIsRate);
}
// A benchmark of SimplifyWithGuarantee using expressions arising from
partitioning.
static void SimplifyFilterWithGuarantee(benchmark::State& state, Expression
filter,
Expression guarantee) {
+ const auto rows_per_batch = static_cast<int32_t>(state.range(0));
+ const auto num_batches = 1000000 / rows_per_batch;
+
auto dataset_schema = schema({field("a", int64()), field("b", int64())});
ASSIGN_OR_ABORT(filter, filter.Bind(*dataset_schema));
for (auto _ : state) {
ABORT_NOT_OK(SimplifyWithGuarantee(filter, guarantee));
}
+
+ state.counters["rows_per_second"] = benchmark::Counter(
+ static_cast<double>(state.iterations() * num_batches * rows_per_batch),
+ benchmark::Counter::kIsRate);
+
+ state.counters["batches_per_second"] = benchmark::Counter(
+ static_cast<double>(state.iterations() * num_batches),
benchmark::Counter::kIsRate);
Review Comment:
This benchmark doesn't have any input. It is simplifying an expression.
I'm not sure there is a rows per second or batches per second that makes sense
here.
##########
cpp/src/arrow/acero/expression_benchmark.cc:
##########
@@ -163,31 +183,103 @@ auto ref_only_expression = field_ref("x");
// Negative queries (partition expressions that fail the filter)
BENCHMARK_CAPTURE(SimplifyFilterWithGuarantee,
negative_filter_simple_guarantee_simple,
- filter_simple_negative, guarantee);
+ filter_simple_negative, guarantee)
+ ->ArgNames({"rows_per_batch"})
+ ->RangeMultiplier(10)
+ ->Range(1000, 1000000)
Review Comment:
I'm not sure you can add `Range` / `RangeMultiplier` here since the
benchmark is not looking at it then it will have no effect?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]