ZhangHuiGui commented on code in PR #41036: URL: https://github.com/apache/arrow/pull/41036#discussion_r1567113203
########## cpp/src/arrow/compute/row/grouper_benchmark.cc: ########## @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/string.h" +#include "benchmark/benchmark.h" + +#include "arrow/compute/row/grouper.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/util/benchmark_util.h" + +namespace arrow { +namespace compute { + +constexpr auto kSeed = 0x0ff1ce; +constexpr int64_t kRound = 256; + +static ExecBatch MakeRandomExecBatch(const DataTypeVector& types, int64_t num_rows, + double null_probability, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = nullptr) { + random::RandomArrayGenerator rng(kSeed); + auto num_types = static_cast<int>(types.size()); + + // clang-format off + auto metadata = key_value_metadata( + { + "null_probability", + "true_probability", + "unique" + }, + { + internal::ToChars(null_probability), + internal::ToChars(null_probability), // for boolean type + internal::ToChars(static_cast<int32_t>(num_rows * 0.5)) // for string type + }); + // clang-format on + + std::vector<Datum> values; + values.resize(num_types); + for (int i = 0; i < num_types; 
++i) { + auto field = ::arrow::field("", types[i], metadata); + values[i] = rng.ArrayOf(*field, num_rows, alignment, memory_pool); + } + + return ExecBatch(std::move(values), num_rows); +} + +static void GrouperBenchmark(benchmark::State& state, const ExecSpan& span, + ExecContext* ctx = nullptr) { + for (auto _ : state) { + ASSIGN_OR_ABORT(auto grouper, Grouper::Make(span.GetTypes(), ctx)); + for (int i = 0; i < kRound; ++i) { + ASSIGN_OR_ABORT(auto group_ids, grouper->Consume(span)); + } + } + + state.SetItemsProcessed(state.iterations() * kRound * span.length); +} + +static void GrouperWithMultiTypes(benchmark::State& state, const DataTypeVector& types) { + auto ctx = default_exec_context(); + + RegressionArgs args(state, false); + const int64_t num_rows = args.size; + const double null_proportion = args.null_proportion; + + auto exec_batch = MakeRandomExecBatch(types, num_rows, null_proportion, + kDefaultBufferAlignment, ctx->memory_pool()); + ExecSpan exec_span(exec_batch); + ASSIGN_OR_ABORT(auto grouper, Grouper::Make(exec_span.GetTypes(), ctx)); + GrouperBenchmark(state, exec_span, ctx); +} + +void SetArgs(benchmark::internal::Benchmark* bench) { + BenchmarkSetArgsWithSizes(bench, {1 << 10, 1 << 12}); +} + +// basic type column +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{boolean}", {boolean()})->Apply(SetArgs); +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32}", {int32()})->Apply(SetArgs); +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int64}", {int64()})->Apply(SetArgs); +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{utf8}", {utf8()})->Apply(SetArgs); +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{fixed_size_binary(128)}", + {fixed_size_binary(128)}) + ->Apply(SetArgs); + +// multi types' columns +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{boolean, utf8}", {boolean(), utf8()}) + ->Apply(SetArgs); +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32, int32}", {int32(), int32()}) + ->Apply(SetArgs); +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32, int64}", {int32(), 
int64()}) + ->Apply(SetArgs); +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{boolean, int64, utf8}", + {boolean(), int64(), utf8()}) + ->Apply(SetArgs); + +// multi types' columns with column resorted Review Comment: After I fixed the bug with `are_cols_in_encoding_order=false` in the grouper, the performance is as follows: `are_cols_in_encoding_order=false` ```shell GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/10000 288 us 288 us 2472 items_per_second=3.55257M/s null_percent=0.01 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/100 332 us 331 us 2132 items_per_second=3.08928M/s null_percent=1 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/10 329 us 329 us 2153 items_per_second=3.11651M/s null_percent=10 num_groups=1.009k size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/2 324 us 324 us 2148 items_per_second=3.16498M/s null_percent=50 num_groups=730 size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/1 359 us 359 us 1948 items_per_second=2.85145M/s null_percent=100 num_groups=1 size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/0 286 us 286 us 2413 items_per_second=3.57668M/s null_percent=0 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/10000 1282 us 1282 us 541 items_per_second=3.19499M/s null_percent=0.01 num_groups=4.096k size=4.096k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/100 1334 us 1334 us 522 items_per_second=3.07032M/s null_percent=1 num_groups=4.092k size=4.096k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/10 1330 us 1329 us 514 items_per_second=3.08117M/s null_percent=10 num_groups=3.999k size=4.096k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/2 1385 us 1385 us 502 items_per_second=2.95787M/s null_percent=50 num_groups=2.779k size=4.096k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/1 1400 us 1400 us 501 items_per_second=2.92583M/s null_percent=100 num_groups=1 size=4.096k 
GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/0 1152 us 1152 us 601 items_per_second=3.55699M/s null_percent=0 num_groups=4.096k size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/10000 330 us 330 us 2081 items_per_second=3.10442M/s null_percent=0.01 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/100 395 us 395 us 1810 items_per_second=2.5926M/s null_percent=1 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/10 395 us 395 us 1771 items_per_second=2.59343M/s null_percent=10 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/2 394 us 394 us 1782 items_per_second=2.59703M/s null_percent=50 num_groups=887 size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/1 422 us 422 us 1657 items_per_second=2.42852M/s null_percent=100 num_groups=1 size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/0 320 us 320 us 2170 items_per_second=3.20108M/s null_percent=0 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/10000 1344 us 1344 us 516 items_per_second=3.04771M/s null_percent=0.01 num_groups=4.096k size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/100 1576 us 1576 us 446 items_per_second=2.59919M/s null_percent=1 num_groups=4.096k size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/10 1650 us 1650 us 431 items_per_second=2.48289M/s null_percent=10 num_groups=4.091k size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/2 1751 us 1751 us 418 items_per_second=2.33929M/s null_percent=50 num_groups=3.484k size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/1 1651 us 1651 us 424 items_per_second=2.48122M/s null_percent=100 num_groups=1 size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/0 1369 us 1369 us 523 items_per_second=2.99227M/s null_percent=0 num_groups=4.096k 
size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/10000 608 us 608 us 1149 items_per_second=1.68426M/s null_percent=0.01 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/100 688 us 688 us 1191 items_per_second=1.48927M/s null_percent=1 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/10 688 us 688 us 1005 items_per_second=1.48878M/s null_percent=10 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/2 738 us 738 us 969 items_per_second=1.38746M/s null_percent=50 num_groups=965 size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/1 568 us 568 us 1237 items_per_second=1.80246M/s null_percent=100 num_groups=1 size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/0 615 us 615 us 1198 items_per_second=1.66566M/s null_percent=0 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/10000 2838 us 2837 us 248 items_per_second=1.44393M/s null_percent=0.01 num_groups=4.096k size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/100 2942 us 2941 us 236 items_per_second=1.39289M/s null_percent=1 num_groups=4.096k size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/10 3018 us 3017 us 235 items_per_second=1.3578M/s null_percent=10 num_groups=4.096k size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/2 3151 us 3150 us 224 items_per_second=1.3002M/s null_percent=50 num_groups=3.828k size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/1 2271 us 2270 us 299 items_per_second=1.80433M/s null_percent=100 num_groups=1 size=4.096k 
GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/0 2663 us 2662 us 270 items_per_second=1.53886M/s null_percent=0 num_groups=4.096k size=4.096k ``` `are_cols_in_encoding_order=true` ```shell GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/10000 290 us 290 us 2469 items_per_second=3.52659M/s null_percent=0.01 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/100 418 us 418 us 1702 items_per_second=2.44793M/s null_percent=1 num_groups=1.27k size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/10 920 us 920 us 731 items_per_second=1.11333M/s null_percent=10 num_groups=3.598k size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/2 4423 us 4422 us 158 items_per_second=231.569k/s null_percent=50 num_groups=6.386k size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/1 367 us 367 us 1946 items_per_second=2.79268M/s null_percent=100 num_groups=1 size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/0 279 us 279 us 2497 items_per_second=3.66686M/s null_percent=0 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/10000 1233 us 1233 us 560 items_per_second=3.32317M/s null_percent=0.01 num_groups=4.098k size=4.096k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/100 1642 us 1642 us 415 items_per_second=2.49463M/s null_percent=1 num_groups=4.971k size=4.096k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/10 4417 us 4415 us 158 items_per_second=927.686k/s null_percent=10 num_groups=14.036k size=4.096k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/2 35053 us 35045 us 20 items_per_second=116.878k/s null_percent=50 num_groups=25.076k size=4.096k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/1 1399 us 1399 us 501 items_per_second=2.92779M/s null_percent=100 num_groups=1 size=4.096k GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/0 1100 us 1100 us 613 items_per_second=3.72269M/s null_percent=0 num_groups=4.096k 
size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/10000 314 us 314 us 2263 items_per_second=3.25722M/s null_percent=0.01 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/100 506 us 506 us 1372 items_per_second=2.02275M/s null_percent=1 num_groups=1.63k size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/10 1515 us 1514 us 466 items_per_second=676.307k/s null_percent=10 num_groups=5.674k size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/2 4296 us 4295 us 163 items_per_second=238.424k/s null_percent=50 num_groups=11.405k size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/1 432 us 432 us 1655 items_per_second=2.37068M/s null_percent=100 num_groups=1 size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/0 311 us 311 us 2239 items_per_second=3.2934M/s null_percent=0 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/10000 1265 us 1264 us 551 items_per_second=3.23969M/s null_percent=0.01 num_groups=4.096k size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/100 2216 us 2216 us 319 items_per_second=1.84872M/s null_percent=1 num_groups=6.139k size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/10 7549 us 7547 us 91 items_per_second=542.731k/s null_percent=10 num_groups=22.322k size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/2 23964 us 23956 us 29 items_per_second=170.979k/s null_percent=50 num_groups=44.868k size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/1 1663 us 1663 us 423 items_per_second=2.46312M/s null_percent=100 num_groups=1 size=4.096k GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/0 1271 us 1271 us 549 items_per_second=3.22388M/s null_percent=0 num_groups=4.096k size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/10000 495 us 495 us 1389 
items_per_second=2.06983M/s null_percent=0.01 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/100 802 us 802 us 883 items_per_second=1.27742M/s null_percent=1 num_groups=1.622k size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/10 2523 us 2522 us 281 items_per_second=405.964k/s null_percent=10 num_groups=6.011k size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/2 6945 us 6942 us 110 items_per_second=147.514k/s null_percent=50 num_groups=11.936k size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/1 626 us 626 us 933 items_per_second=1.63646M/s null_percent=100 num_groups=1 size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/1024/0 513 us 513 us 1392 items_per_second=1.99517M/s null_percent=0 num_groups=1.024k size=1.024k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/10000 2359 us 2359 us 294 items_per_second=1.73639M/s null_percent=0.01 num_groups=4.1k size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/100 3889 us 3888 us 178 items_per_second=1.05341M/s null_percent=1 num_groups=6.219k size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/10 13720 us 13715 us 50 items_per_second=298.661k/s null_percent=10 num_groups=24k size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/2 26470 us 26462 us 24 items_per_second=154.787k/s null_percent=50 num_groups=48.565k size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/1 2322 us 2322 us 302 items_per_second=1.7641M/s null_percent=100 num_groups=1 size=4.096k GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), boolean}"/4096/0 2070 us 2069 us 340 items_per_second=1.97969M/s 
null_percent=0 num_groups=4.096k size=4.096k ``` Also, the fix for a bug that causes null comparisons to produce wrong results (https://github.com/apache/arrow/pull/40998) hasn't been merged yet, so some of the num_groups values are wrong when `are_cols_in_encoding_order=true`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
