icexelloss commented on code in PR #34311:
URL: https://github.com/apache/arrow/pull/34311#discussion_r1128114556
##########
cpp/src/arrow/compute/kernels/hash_aggregate_test.cc:
##########
@@ -135,22 +142,96 @@ Result<Datum> NaiveGroupBy(std::vector<Datum> arguments,
std::vector<Datum> keys
return Take(struct_arr, sorted_indices);
}
+Result<Datum> MakeGroupByOutput(const std::vector<ExecBatch>& output_batches,
+ const std::shared_ptr<Schema> output_schema,
+ size_t num_aggregates, size_t num_keys, bool
naive) {
+ ArrayVector out_arrays(num_aggregates + num_keys);
+ for (size_t i = 0; i < out_arrays.size(); ++i) {
+ std::vector<std::shared_ptr<Array>> arrays(output_batches.size());
+ for (size_t j = 0; j < output_batches.size(); ++j) {
+ arrays[j] = output_batches[j].values[i].make_array();
+ }
+ if (arrays.empty()) {
+ ARROW_ASSIGN_OR_RAISE(
+ out_arrays[i],
+ MakeArrayOfNull(output_schema->field(static_cast<int>(i))->type(),
+ /*length=*/0));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(out_arrays[i], Concatenate(arrays));
+ }
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Array> struct_arr,
+ StructArray::Make(std::move(out_arrays), output_schema->fields()));
+
+ bool need_sort = !naive;
+ for (size_t i = num_aggregates; need_sort && i < out_arrays.size(); i++) {
+ if (output_schema->field(static_cast<int>(i))->type()->id() ==
Type::DICTIONARY) {
+ need_sort = false;
+ }
+ }
+ if (!need_sort) {
+ return struct_arr;
+ }
+
+  // The exec plan may reorder the output rows. The tests are all set up to
expect output
+ // in ascending order of keys. So we need to sort the result by the key
columns. To do
+ // that we create a table using the key columns, calculate the sort indices
from that
+ // table (sorting on all fields) and then use those indices to calculate our
result.
+ std::vector<std::shared_ptr<Field>> key_fields;
+ std::vector<std::shared_ptr<Array>> key_columns;
+ std::vector<SortKey> sort_keys;
+ for (std::size_t i = 0; i < num_keys; i++) {
+ const std::shared_ptr<Array>& arr = out_arrays[i + num_aggregates];
+ key_columns.push_back(arr);
+ key_fields.push_back(field("name_does_not_matter", arr->type()));
+ sort_keys.emplace_back(static_cast<int>(i));
+ }
+ std::shared_ptr<Schema> key_schema = schema(std::move(key_fields));
+ std::shared_ptr<Table> key_table = Table::Make(std::move(key_schema),
key_columns);
+ SortOptions sort_options(std::move(sort_keys));
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> sort_indices,
+ SortIndices(key_table, sort_options));
+
+ return Take(struct_arr, sort_indices);
+}
+
Result<Datum> RunGroupBy(const BatchesWithSchema& input,
const std::vector<std::string>& key_names,
- const std::vector<Aggregate>& aggregates, bool
use_threads) {
+ const std::vector<std::string>& segment_key_names,
+ const std::vector<Aggregate>& aggregates,
ExecContext* ctx,
+ bool use_threads, bool segmented = false, bool naive
= false) {
+ // When segment_keys is non-empty the `segmented` flag is always true;
otherwise (when
+ // empty), it may still be set to true. In this case, the tester
restructures (without
+ // changing the data of) the result of RunGroupBy from
`std::vector<ExecBatch>`
+ // (output_batches) to `std::vector<ArrayVector>` (out_arrays), which have
the structure
+ // typical of the case of a non-empty segment_keys (with multiple arrays per
column, one
+ // array per segment) but only one array per column (because, technically,
there is only
+ // one segment in this case). Thus, this case focuses on the structure of
the result.
+ //
+ // The `naive` flag means that the output is expected to be like that of
`NaiveGroupBy`,
+ // which in particular doesn't require sorting. The reason for the naive
flag is that
Review Comment:
"doesn't require sorting" means that the output of `NaiveGroupBy` is
unsorted or does this mean sth else?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]