dhruv9vats commented on a change in pull request #12484:
URL: https://github.com/apache/arrow/pull/12484#discussion_r822454763
##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -2758,6 +2758,317 @@ struct GroupedOneFactory {
InputType argument_type;
};
+// ----------------------------------------------------------------------
+// List implementation
+
+template <typename Type, typename Enable = void>
+struct GroupedListImpl final : public GroupedAggregator {
+ using CType = typename TypeTraits<Type>::CType;
+ using GetSet = GroupedValueTraits<Type>;
+
+ Status Init(ExecContext* ctx, const std::vector<ValueDescr>&,
+ const FunctionOptions* options) override {
+ ctx_ = ctx;
+ // out_type_ initialized by GroupedListInit
+ values_ = TypedBufferBuilder<CType>(ctx_->memory_pool());
+ groups_ = TypedBufferBuilder<uint32_t>(ctx_->memory_pool());
+ values_bitmap_ = TypedBufferBuilder<bool>(ctx_->memory_pool());
+ return Status::OK();
+ }
+
+ Status Resize(int64_t new_num_groups) override {
+ num_groups_ = new_num_groups;
+ return Status::OK();
+ }
+
+ Status Consume(const ExecBatch& batch) override {
+ const auto* groups = batch[1].array()->GetValues<uint32_t>(1);
Review comment:
Can you elaborate on the assumption you are suggesting here? Consider, for
example:
```cpp
auto in_schema = schema({
field("floats", float64()),
// field("nulls", null()),
field("booleans", boolean()),
field("decimal128", decimal128(3, 2)),
field("decimal256", decimal256(3, 2)),
field("fixed_binary", fixed_size_binary(3)),
field("key", int64()),
});
auto table = TableFromJSON(in_schema, {R"([
[null, true, null, null, null, 1],
[1.0, true, "1.01", "1.01", "aaa", 1]
])",
R"([
[0.0, false, "0.00", "0.00", "bac", 2],
[null, false, null, null, null, 3],
[4.0, null, "4.01", "4.01", "234", null],
[3.25, true, "3.25", "3.25", "ddd", 1],
[0.125, false, "0.12", "0.12", "bcd", 2]
])",
R"([
[-0.25, false, "-0.25", "-0.25", "bab", 2],
[0.75, true, "0.75", "0.75", "123", null],
[null, true, null, null, null, 3]
])"});
```
The 2nd batch of the above table on my local machine is sometimes received
in chunks as:
```cpp
// with offset 0
[0.0, false, "0.00", "0.00", "bac", 2],
[null, false, null, null, null, 3],
```
```cpp
// with offset 2
[4.0, null, "4.01", "4.01", "234", null],
[3.25, true, "3.25", "3.25", "ddd", 1],
```
```cpp
// with offset 4
[0.125, false, "0.12", "0.12", "bcd", 2]
```
The kernel works correctly _without_ the DCHECK (and, of course, fails with
it).
##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -630,6 +635,12 @@ struct GroupedValueTraits<BooleanType> {
static void Set(uint8_t* values, uint32_t g, bool v) {
bit_util::SetBitTo(values, g, v);
}
+ static Status AppendBuffers(TypedBufferBuilder<bool>& destination, const bool* values,
+ int64_t num_values) {
+ RETURN_NOT_OK(destination.Reserve(num_values));
+ destination.UnsafeAppend(reinterpret_cast<const uint8_t*>(values), 0, num_values);
+ return Status::OK();
+ }
Review comment:
Is this similar to what you had in mind?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]