icexelloss commented on code in PR #34311:
URL: https://github.com/apache/arrow/pull/34311#discussion_r1128114556
##########
cpp/src/arrow/compute/kernels/hash_aggregate_test.cc:
##########
@@ -135,22 +142,96 @@ Result<Datum> NaiveGroupBy(std::vector<Datum> arguments,
std::vector<Datum> keys
return Take(struct_arr, sorted_indices);
}
+Result<Datum> MakeGroupByOutput(const std::vector<ExecBatch>& output_batches,
+ const std::shared_ptr<Schema> output_schema,
+ size_t num_aggregates, size_t num_keys, bool
naive) {
+ ArrayVector out_arrays(num_aggregates + num_keys);
+ for (size_t i = 0; i < out_arrays.size(); ++i) {
+ std::vector<std::shared_ptr<Array>> arrays(output_batches.size());
+ for (size_t j = 0; j < output_batches.size(); ++j) {
+ arrays[j] = output_batches[j].values[i].make_array();
+ }
+ if (arrays.empty()) {
+ ARROW_ASSIGN_OR_RAISE(
+ out_arrays[i],
+ MakeArrayOfNull(output_schema->field(static_cast<int>(i))->type(),
+ /*length=*/0));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(out_arrays[i], Concatenate(arrays));
+ }
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Array> struct_arr,
+ StructArray::Make(std::move(out_arrays), output_schema->fields()));
+
+ bool need_sort = !naive;
+ for (size_t i = num_aggregates; need_sort && i < out_arrays.size(); i++) {
+ if (output_schema->field(static_cast<int>(i))->type()->id() ==
Type::DICTIONARY) {
+ need_sort = false;
+ }
+ }
+ if (!need_sort) {
+ return struct_arr;
+ }
+
+  // The exec plan may reorder the output rows. The tests are all set up to
expect output
+ // in ascending order of keys. So we need to sort the result by the key
columns. To do
+ // that we create a table using the key columns, calculate the sort indices
from that
+ // table (sorting on all fields) and then use those indices to calculate our
result.
+ std::vector<std::shared_ptr<Field>> key_fields;
+ std::vector<std::shared_ptr<Array>> key_columns;
+ std::vector<SortKey> sort_keys;
+ for (std::size_t i = 0; i < num_keys; i++) {
+ const std::shared_ptr<Array>& arr = out_arrays[i + num_aggregates];
+ key_columns.push_back(arr);
+ key_fields.push_back(field("name_does_not_matter", arr->type()));
+ sort_keys.emplace_back(static_cast<int>(i));
+ }
+ std::shared_ptr<Schema> key_schema = schema(std::move(key_fields));
+ std::shared_ptr<Table> key_table = Table::Make(std::move(key_schema),
key_columns);
+ SortOptions sort_options(std::move(sort_keys));
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> sort_indices,
+ SortIndices(key_table, sort_options));
+
+ return Take(struct_arr, sort_indices);
+}
+
Result<Datum> RunGroupBy(const BatchesWithSchema& input,
const std::vector<std::string>& key_names,
- const std::vector<Aggregate>& aggregates, bool
use_threads) {
+ const std::vector<std::string>& segment_key_names,
+ const std::vector<Aggregate>& aggregates,
ExecContext* ctx,
+ bool use_threads, bool segmented = false, bool naive
= false) {
+ // When segment_keys is non-empty the `segmented` flag is always true;
otherwise (when
+ // empty), it may still be set to true. In this case, the tester
restructures (without
+ // changing the data of) the result of RunGroupBy from
`std::vector<ExecBatch>`
+ // (output_batches) to `std::vector<ArrayVector>` (out_arrays), which have
the structure
+ // typical of the case of a non-empty segment_keys (with multiple arrays per
column, one
+ // array per segment) but only one array per column (because, technically,
there is only
+ // one segment in this case). Thus, this case focuses on the structure of
the result.
+ //
+ // The `naive` flag means that the output is expected to be like that of
`NaiveGroupBy`,
+ // which in particular doesn't require sorting. The reason for the naive
flag is that
Review Comment:
"doesn't require sorting" means that the output of `NaiveGroupBy` is
unsorted or does this mean sth else?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]