lidavidm commented on a change in pull request #12484:
URL: https://github.com/apache/arrow/pull/12484#discussion_r824820358



##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -3129,6 +3537,9 @@ const FunctionDoc hash_one_doc{"Get one value from each 
group",
                                ("Null values are also returned."),
                                {"array", "group_id_array"}};
 
+const FunctionDoc hash_list_doc{"List array of all groups",

Review comment:
       "List all values in each group"?

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -621,6 +622,12 @@ struct GroupedValueTraits {
 
   static CType Get(const CType* values, uint32_t g) { return values[g]; }
   static void Set(CType* values, uint32_t g, CType v) { values[g] = v; }
+  static Status AppendBuffers(TypedBufferBuilder<CType>* destination,
+                              const uint8_t* values, int64_t offset, int64_t 
num_values) {
+    RETURN_NOT_OK(
+        destination->Append(reinterpret_cast<const CType*>(values) + offset, 
num_values));

Review comment:
       This is why I mentioned passing the actual type here as well `const 
DataType& type` since you can use `FixedWidthType::bit_width` to compute the 
width (I believe there's a byte_width in the class hierarchy as well)

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -2758,6 +2771,343 @@ struct GroupedOneFactory {
   InputType argument_type;
 };
 
+// ----------------------------------------------------------------------
+// List implementation
+
+template <typename Type, typename Enable = void>
+struct GroupedListImpl final : public GroupedAggregator {
+  using CType = typename TypeTraits<Type>::CType;
+  using GetSet = GroupedValueTraits<Type>;
+
+  Status Init(ExecContext* ctx, const std::vector<ValueDescr>&,
+              const FunctionOptions* options) override {
+    ctx_ = ctx;
+    has_nulls_ = false;
+    // out_type_ initialized by GroupedListInit
+    values_ = TypedBufferBuilder<CType>(ctx_->memory_pool());
+    groups_ = TypedBufferBuilder<uint32_t>(ctx_->memory_pool());
+    values_bitmap_ = TypedBufferBuilder<bool>(ctx_->memory_pool());
+    return Status::OK();
+  }
+
+  Status Resize(int64_t new_num_groups) override {
+    num_groups_ = new_num_groups;
+    return Status::OK();
+  }
+
+  Status Consume(const ExecBatch& batch) override {
+    int64_t num_values = batch[0].array()->length;
+    int64_t offset = batch[0].array()->offset;
+
+    const auto* groups = batch[1].array()->GetValues<uint32_t>(1, 0);
+    RETURN_NOT_OK(groups_.Append(groups, num_values));

Review comment:
       I assume the grouper never generates a group ID array with an offset, we 
can add a DCHECK to enforce this

##########
File path: docs/source/cpp/compute.rst
##########
@@ -334,25 +334,27 @@ equivalents above and reflects how they are implemented 
internally.
 
+-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
 | hash_distinct           | Unary | Any                                | Input 
type             | :struct:`CountOptions`           | \(2)  |
 
+-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_list               | Unary | Any                                | List 
value type        |                                  | \(3)  |

Review comment:
       Maybe return "List of input type" to be clear

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -621,6 +622,12 @@ struct GroupedValueTraits {
 
   static CType Get(const CType* values, uint32_t g) { return values[g]; }
   static void Set(CType* values, uint32_t g, CType v) { values[g] = v; }
+  static Status AppendBuffers(TypedBufferBuilder<CType>* destination,
+                              const uint8_t* values, int64_t offset, int64_t 
num_values) {
+    RETURN_NOT_OK(
+        destination->Append(reinterpret_cast<const CType*>(values) + offset, 
num_values));

Review comment:
       You can look here for inspiration: 
https://github.com/apache/arrow/blob/master/cpp/src/arrow/compute/kernels/copy_data_internal.h
   
   …and frankly, you can consider just using those utilities directly! I forgot 
about that




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to