lidavidm commented on a change in pull request #12484: URL: https://github.com/apache/arrow/pull/12484#discussion_r823887982
########## File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc ########## @@ -2758,6 +2771,343 @@ struct GroupedOneFactory { InputType argument_type; }; +// ---------------------------------------------------------------------- +// List implementation + +template <typename Type, typename Enable = void> +struct GroupedListImpl final : public GroupedAggregator { + using CType = typename TypeTraits<Type>::CType; + using GetSet = GroupedValueTraits<Type>; + + Status Init(ExecContext* ctx, const std::vector<ValueDescr>&, + const FunctionOptions* options) override { + ctx_ = ctx; + has_nulls_ = false; + // out_type_ initialized by GroupedListInit + values_ = TypedBufferBuilder<CType>(ctx_->memory_pool()); + groups_ = TypedBufferBuilder<uint32_t>(ctx_->memory_pool()); + values_bitmap_ = TypedBufferBuilder<bool>(ctx_->memory_pool()); + return Status::OK(); + } + + Status Resize(int64_t new_num_groups) override { + num_groups_ = new_num_groups; + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { + int64_t num_values = batch[0].array()->length; + int64_t offset = batch[0].array()->offset; + + const auto* groups = batch[1].array()->GetValues<uint32_t>(1, 0); + RETURN_NOT_OK(groups_.Append(groups, num_values)); Review comment: What's not okay? ########## File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc ########## @@ -2758,6 +2771,343 @@ struct GroupedOneFactory { InputType argument_type; }; +// ---------------------------------------------------------------------- +// List implementation + +template <typename Type, typename Enable = void> +struct GroupedListImpl final : public GroupedAggregator { + using CType = typename TypeTraits<Type>::CType; + using GetSet = GroupedValueTraits<Type>; + + Status Init(ExecContext* ctx, const std::vector<ValueDescr>&, + const FunctionOptions* options) override { + ctx_ = ctx; + has_nulls_ = false; + // out_type_ initialized by GroupedListInit + values_ = TypedBufferBuilder<CType>(ctx_->memory_pool()); + groups_ = TypedBufferBuilder<uint32_t>(ctx_->memory_pool()); + values_bitmap_ = TypedBufferBuilder<bool>(ctx_->memory_pool()); + return Status::OK(); + } + + Status Resize(int64_t new_num_groups) override { + num_groups_ = new_num_groups; + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { + int64_t num_values = batch[0].array()->length; + int64_t offset = batch[0].array()->offset; + + const auto* groups = batch[1].array()->GetValues<uint32_t>(1, 0); + RETURN_NOT_OK(groups_.Append(groups, num_values)); + + const auto* values = batch[0].array()->GetValues<CType>(1, 0); + RETURN_NOT_OK(GetSet::AppendBuffers(&values_, values, offset, num_values)); + + if (batch[0].null_count() > 0) { + if (!has_nulls_) { + has_nulls_ = true; + RETURN_NOT_OK(values_bitmap_.Append(num_args_, true)); + } + const auto* values_bitmap = batch[0].array()->GetValues<bool>(0, 0); Review comment: ```suggestion const auto* values_bitmap = batch[0].array()->GetValues<uint8_t>(0, 0); ``` ########## File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc ########## @@ -2758,6 +2771,343 @@ struct GroupedOneFactory { InputType argument_type; }; +// ---------------------------------------------------------------------- +// List implementation + +template <typename Type, typename Enable = void> +struct GroupedListImpl final : public GroupedAggregator { + using CType = typename TypeTraits<Type>::CType; + using GetSet = GroupedValueTraits<Type>; + + Status Init(ExecContext* ctx, const std::vector<ValueDescr>&, + const FunctionOptions* options) override { + ctx_ = ctx; + has_nulls_ = false; + // out_type_ initialized by GroupedListInit + values_ = TypedBufferBuilder<CType>(ctx_->memory_pool()); + groups_ = TypedBufferBuilder<uint32_t>(ctx_->memory_pool()); + values_bitmap_ = TypedBufferBuilder<bool>(ctx_->memory_pool()); + return Status::OK(); + } + + Status Resize(int64_t new_num_groups) override { + num_groups_ = new_num_groups; + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { + int64_t num_values = batch[0].array()->length; + int64_t offset = batch[0].array()->offset; + + const auto* groups = batch[1].array()->GetValues<uint32_t>(1, 0); + RETURN_NOT_OK(groups_.Append(groups, num_values)); + + const auto* values = batch[0].array()->GetValues<CType>(1, 0); + RETURN_NOT_OK(GetSet::AppendBuffers(&values_, values, offset, num_values)); + + if (batch[0].null_count() > 0) { + if (!has_nulls_) { + has_nulls_ = true; + RETURN_NOT_OK(values_bitmap_.Append(num_args_, true)); + } + const auto* values_bitmap = batch[0].array()->GetValues<bool>(0, 0); + RETURN_NOT_OK(GroupedValueTraits<BooleanType>::AppendBuffers( + &values_bitmap_, values_bitmap, offset, num_values)); + } else if (has_nulls_) { + RETURN_NOT_OK(values_bitmap_.Append(num_values, true)); + } + num_args_ += num_values; + return Status::OK(); + } + + Status Merge(GroupedAggregator&& raw_other, + const ArrayData& group_id_mapping) override { + auto other = checked_cast<GroupedListImpl*>(&raw_other); + const auto* other_raw_groups = other->groups_.data(); + const auto* g = group_id_mapping.GetValues<uint32_t>(1); + + for (uint32_t other_g = 0; static_cast<int64_t>(other_g) < other->num_args_; + ++other_g) { + RETURN_NOT_OK(groups_.Append(g[other_raw_groups[other_g]])); + } + + const auto* values = reinterpret_cast<const CType*>(other->values_.data()); Review comment: Yes, GetSet::AppendBuffers should take uint8_t* uniformly, casting to `bool*` in one case is something we should avoid. ########## File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc ########## @@ -2758,6 +2771,343 @@ struct GroupedOneFactory { InputType argument_type; }; +// ---------------------------------------------------------------------- +// List implementation + +template <typename Type, typename Enable = void> +struct GroupedListImpl final : public GroupedAggregator { + using CType = typename TypeTraits<Type>::CType; + using GetSet = GroupedValueTraits<Type>; + + Status Init(ExecContext* ctx, const std::vector<ValueDescr>&, + const FunctionOptions* options) override { + ctx_ = ctx; + has_nulls_ = false; + // out_type_ initialized by GroupedListInit + values_ = TypedBufferBuilder<CType>(ctx_->memory_pool()); + groups_ = TypedBufferBuilder<uint32_t>(ctx_->memory_pool()); + values_bitmap_ = TypedBufferBuilder<bool>(ctx_->memory_pool()); + return Status::OK(); + } + + Status Resize(int64_t new_num_groups) override { + num_groups_ = new_num_groups; + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { + int64_t num_values = batch[0].array()->length; + int64_t offset = batch[0].array()->offset; + + const auto* groups = batch[1].array()->GetValues<uint32_t>(1, 0); + RETURN_NOT_OK(groups_.Append(groups, num_values)); + + const auto* values = batch[0].array()->GetValues<CType>(1, 0); + RETURN_NOT_OK(GetSet::AppendBuffers(&values_, values, offset, num_values)); + + if (batch[0].null_count() > 0) { + if (!has_nulls_) { + has_nulls_ = true; + RETURN_NOT_OK(values_bitmap_.Append(num_args_, true)); + } + const auto* values_bitmap = batch[0].array()->GetValues<bool>(0, 0); + RETURN_NOT_OK(GroupedValueTraits<BooleanType>::AppendBuffers( + &values_bitmap_, values_bitmap, offset, num_values)); + } else if (has_nulls_) { + RETURN_NOT_OK(values_bitmap_.Append(num_values, true)); + } + num_args_ += num_values; + return Status::OK(); + } + + Status Merge(GroupedAggregator&& raw_other, + const ArrayData& group_id_mapping) override { + auto other = checked_cast<GroupedListImpl*>(&raw_other); + const auto* other_raw_groups = other->groups_.data(); + const auto* g = group_id_mapping.GetValues<uint32_t>(1); + + for (uint32_t other_g = 0; static_cast<int64_t>(other_g) < other->num_args_; + ++other_g) { + RETURN_NOT_OK(groups_.Append(g[other_raw_groups[other_g]])); + } + + const auto* values = reinterpret_cast<const CType*>(other->values_.data()); Review comment: Note you'll have to take the type as well, so you can compute the bit/byte width. That has the added bonus, however, that you _should_ be able to use it for FixedSizeBinaryType now too. (Though, it may be tricky to get everything else to line up in the templates.) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org