ZhangHuiGui commented on code in PR #41036:
URL: https://github.com/apache/arrow/pull/41036#discussion_r1567113203


##########
cpp/src/arrow/compute/row/grouper_benchmark.cc:
##########
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/string.h"
+#include "benchmark/benchmark.h"
+
+#include "arrow/compute/row/grouper.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
+#include "arrow/util/benchmark_util.h"
+
+namespace arrow {
+namespace compute {
+
+constexpr auto kSeed = 0x0ff1ce;
+constexpr int64_t kRound = 256;
+
+static ExecBatch MakeRandomExecBatch(const DataTypeVector& types, int64_t 
num_rows,
+                                     double null_probability,
+                                     int64_t alignment = 
kDefaultBufferAlignment,
+                                     MemoryPool* memory_pool = nullptr) {
+  random::RandomArrayGenerator rng(kSeed);
+  auto num_types = static_cast<int>(types.size());
+
+  // clang-format off
+  auto metadata = key_value_metadata(
+      {
+        "null_probability",
+        "true_probability",
+        "unique"
+      },
+      {
+          internal::ToChars(null_probability),
+          internal::ToChars(null_probability),                     // for 
boolean type
+          internal::ToChars(static_cast<int32_t>(num_rows * 0.5))  // for 
string type
+      });
+  // clang-format on
+
+  std::vector<Datum> values;
+  values.resize(num_types);
+  for (int i = 0; i < num_types; ++i) {
+    auto field = ::arrow::field("", types[i], metadata);
+    values[i] = rng.ArrayOf(*field, num_rows, alignment, memory_pool);
+  }
+
+  return ExecBatch(std::move(values), num_rows);
+}
+
+static void GrouperBenchmark(benchmark::State& state, const ExecSpan& span,
+                             ExecContext* ctx = nullptr) {
+  for (auto _ : state) {
+    ASSIGN_OR_ABORT(auto grouper, Grouper::Make(span.GetTypes(), ctx));
+    for (int i = 0; i < kRound; ++i) {
+      ASSIGN_OR_ABORT(auto group_ids, grouper->Consume(span));
+    }
+  }
+
+  state.SetItemsProcessed(state.iterations() * kRound * span.length);
+}
+
+static void GrouperWithMultiTypes(benchmark::State& state, const 
DataTypeVector& types) {
+  auto ctx = default_exec_context();
+
+  RegressionArgs args(state, false);
+  const int64_t num_rows = args.size;
+  const double null_proportion = args.null_proportion;
+
+  auto exec_batch = MakeRandomExecBatch(types, num_rows, null_proportion,
+                                        kDefaultBufferAlignment, 
ctx->memory_pool());
+  ExecSpan exec_span(exec_batch);
+  ASSIGN_OR_ABORT(auto grouper, Grouper::Make(exec_span.GetTypes(), ctx));
+  GrouperBenchmark(state, exec_span, ctx);
+}
+
+void SetArgs(benchmark::internal::Benchmark* bench) {
+  BenchmarkSetArgsWithSizes(bench, {1 << 10, 1 << 12});
+}
+
+// basic type column
+BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{boolean}", 
{boolean()})->Apply(SetArgs);
+BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32}", {int32()})->Apply(SetArgs);
+BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int64}", {int64()})->Apply(SetArgs);
+BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{utf8}", {utf8()})->Apply(SetArgs);
+BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{fixed_size_binary(128)}",
+                  {fixed_size_binary(128)})
+    ->Apply(SetArgs);
+
+// multi types' columns
+BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{boolean, utf8}", {boolean(), 
utf8()})
+    ->Apply(SetArgs);
+BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32, int32}", {int32(), int32()})
+    ->Apply(SetArgs);
+BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32, int64}", {int32(), int64()})
+    ->Apply(SetArgs);
+BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{boolean, int64, utf8}",
+                  {boolean(), int64(), utf8()})
+    ->Apply(SetArgs);
+
+// multi types' columns with column resorted

Review Comment:
   After I fixed the `are_cols_in_encoding_order=false` bug in the grouper, 
the performance is as follows:
   
   `are_cols_in_encoding_order=false`
   ```shell
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/10000                    
                   288 us          288 us         2472 
items_per_second=3.55257M/s null_percent=0.01 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/100                      
                   332 us          331 us         2132 
items_per_second=3.08928M/s null_percent=1 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/10                       
                   329 us          329 us         2153 
items_per_second=3.11651M/s null_percent=10 num_groups=1.009k size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/2                        
                   324 us          324 us         2148 
items_per_second=3.16498M/s null_percent=50 num_groups=730 size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/1                        
                   359 us          359 us         1948 
items_per_second=2.85145M/s null_percent=100 num_groups=1 size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/0                        
                   286 us          286 us         2413 
items_per_second=3.57668M/s null_percent=0 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/10000                    
                  1282 us         1282 us          541 
items_per_second=3.19499M/s null_percent=0.01 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/100                      
                  1334 us         1334 us          522 
items_per_second=3.07032M/s null_percent=1 num_groups=4.092k size=4.096k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/10                       
                  1330 us         1329 us          514 
items_per_second=3.08117M/s null_percent=10 num_groups=3.999k size=4.096k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/2                        
                  1385 us         1385 us          502 
items_per_second=2.95787M/s null_percent=50 num_groups=2.779k size=4.096k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/1                        
                  1400 us         1400 us          501 
items_per_second=2.92583M/s null_percent=100 num_groups=1 size=4.096k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/0                        
                  1152 us         1152 us          601 
items_per_second=3.55699M/s null_percent=0 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/10000             
                   330 us          330 us         2081 
items_per_second=3.10442M/s null_percent=0.01 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/100               
                   395 us          395 us         1810 
items_per_second=2.5926M/s null_percent=1 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/10                
                   395 us          395 us         1771 
items_per_second=2.59343M/s null_percent=10 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/2                 
                   394 us          394 us         1782 
items_per_second=2.59703M/s null_percent=50 num_groups=887 size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/1                 
                   422 us          422 us         1657 
items_per_second=2.42852M/s null_percent=100 num_groups=1 size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/0                 
                   320 us          320 us         2170 
items_per_second=3.20108M/s null_percent=0 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/10000             
                  1344 us         1344 us          516 
items_per_second=3.04771M/s null_percent=0.01 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/100               
                  1576 us         1576 us          446 
items_per_second=2.59919M/s null_percent=1 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/10                
                  1650 us         1650 us          431 
items_per_second=2.48289M/s null_percent=10 num_groups=4.091k size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/2                 
                  1751 us         1751 us          418 
items_per_second=2.33929M/s null_percent=50 num_groups=3.484k size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/1                 
                  1651 us         1651 us          424 
items_per_second=2.48122M/s null_percent=100 num_groups=1 size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/0                 
                  1369 us         1369 us          523 
items_per_second=2.99227M/s null_percent=0 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/10000        608 us          608 us         1149 
items_per_second=1.68426M/s null_percent=0.01 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/100          688 us          688 us         1191 
items_per_second=1.48927M/s null_percent=1 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/10           688 us          688 us         1005 
items_per_second=1.48878M/s null_percent=10 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/2            738 us          738 us          969 
items_per_second=1.38746M/s null_percent=50 num_groups=965 size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/1            568 us          568 us         1237 
items_per_second=1.80246M/s null_percent=100 num_groups=1 size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/0            615 us          615 us         1198 
items_per_second=1.66566M/s null_percent=0 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/10000       2838 us         2837 us          248 
items_per_second=1.44393M/s null_percent=0.01 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/100         2942 us         2941 us          236 
items_per_second=1.39289M/s null_percent=1 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/10          3018 us         3017 us          235 
items_per_second=1.3578M/s null_percent=10 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/2           3151 us         3150 us          224 
items_per_second=1.3002M/s null_percent=50 num_groups=3.828k size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/1           2271 us         2270 us          299 
items_per_second=1.80433M/s null_percent=100 num_groups=1 size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/0           2663 us         2662 us          270 
items_per_second=1.53886M/s null_percent=0 num_groups=4.096k size=4.096k
   ```
   `are_cols_in_encoding_order=true`
   ```shell
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/10000                    
                   290 us          290 us         2469 
items_per_second=3.52659M/s null_percent=0.01 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/100                      
                   418 us          418 us         1702 
items_per_second=2.44793M/s null_percent=1 num_groups=1.27k size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/10                       
                   920 us          920 us          731 
items_per_second=1.11333M/s null_percent=10 num_groups=3.598k size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/2                        
                  4423 us         4422 us          158 
items_per_second=231.569k/s null_percent=50 num_groups=6.386k size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/1                        
                   367 us          367 us         1946 
items_per_second=2.79268M/s null_percent=100 num_groups=1 size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/1024/0                        
                   279 us          279 us         2497 
items_per_second=3.66686M/s null_percent=0 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/10000                    
                  1233 us         1233 us          560 
items_per_second=3.32317M/s null_percent=0.01 num_groups=4.098k size=4.096k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/100                      
                  1642 us         1642 us          415 
items_per_second=2.49463M/s null_percent=1 num_groups=4.971k size=4.096k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/10                       
                  4417 us         4415 us          158 
items_per_second=927.686k/s null_percent=10 num_groups=14.036k size=4.096k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/2                        
                 35053 us        35045 us           20 
items_per_second=116.878k/s null_percent=50 num_groups=25.076k size=4.096k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/1                        
                  1399 us         1399 us          501 
items_per_second=2.92779M/s null_percent=100 num_groups=1 size=4.096k
   GrouperWithMultiTypes/"{int32, boolean, utf8}"/4096/0                        
                  1100 us         1100 us          613 
items_per_second=3.72269M/s null_percent=0 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/10000             
                   314 us          314 us         2263 
items_per_second=3.25722M/s null_percent=0.01 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/100               
                   506 us          506 us         1372 
items_per_second=2.02275M/s null_percent=1 num_groups=1.63k size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/10                
                  1515 us         1514 us          466 
items_per_second=676.307k/s null_percent=10 num_groups=5.674k size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/2                 
                  4296 us         4295 us          163 
items_per_second=238.424k/s null_percent=50 num_groups=11.405k size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/1                 
                   432 us          432 us         1655 
items_per_second=2.37068M/s null_percent=100 num_groups=1 size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/1024/0                 
                   311 us          311 us         2239 
items_per_second=3.2934M/s null_percent=0 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/10000             
                  1265 us         1264 us          551 
items_per_second=3.23969M/s null_percent=0.01 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/100               
                  2216 us         2216 us          319 
items_per_second=1.84872M/s null_percent=1 num_groups=6.139k size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/10                
                  7549 us         7547 us           91 
items_per_second=542.731k/s null_percent=10 num_groups=22.322k size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/2                 
                 23964 us        23956 us           29 
items_per_second=170.979k/s null_percent=50 num_groups=44.868k size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/1                 
                  1663 us         1663 us          423 
items_per_second=2.46312M/s null_percent=100 num_groups=1 size=4.096k
   GrouperWithMultiTypes/"{int32, int64, boolean, utf8}"/4096/0                 
                  1271 us         1271 us          549 
items_per_second=3.22388M/s null_percent=0 num_groups=4.096k size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/10000        495 us          495 us         1389 
items_per_second=2.06983M/s null_percent=0.01 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/100          802 us          802 us          883 
items_per_second=1.27742M/s null_percent=1 num_groups=1.622k size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/10          2523 us         2522 us          281 
items_per_second=405.964k/s null_percent=10 num_groups=6.011k size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/2           6945 us         6942 us          110 
items_per_second=147.514k/s null_percent=50 num_groups=11.936k size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/1            626 us          626 us          933 
items_per_second=1.63646M/s null_percent=100 num_groups=1 size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/1024/0            513 us          513 us         1392 
items_per_second=1.99517M/s null_percent=0 num_groups=1.024k size=1.024k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/10000       2359 us         2359 us          294 
items_per_second=1.73639M/s null_percent=0.01 num_groups=4.1k size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/100         3889 us         3888 us          178 
items_per_second=1.05341M/s null_percent=1 num_groups=6.219k size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/10         13720 us        13715 us           50 
items_per_second=298.661k/s null_percent=10 num_groups=24k size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/2          26470 us        26462 us           24 
items_per_second=154.787k/s null_percent=50 num_groups=48.565k size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/1           2322 us         2322 us          302 
items_per_second=1.7641M/s null_percent=100 num_groups=1 size=4.096k
   GrouperWithMultiTypes/"{utf8, int32, int64, fixed_size_binary(128), 
boolean}"/4096/0           2070 us         2069 us          340 
items_per_second=1.97969M/s null_percent=0 num_groups=4.096k size=4.096k
   ```
   
   There is also a bug that causes null comparisons to give a wrong result; 
the fix for it, https://github.com/apache/arrow/pull/40998, hasn't been merged 
yet. So some of the `num_groups` values are wrong when 
`are_cols_in_encoding_order=true`.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to