alamb commented on code in PR #17915:
URL: https://github.com/apache/datafusion/pull/17915#discussion_r2411896875


##########
datafusion/functions-aggregate-common/src/aggregate/array_agg.rs:
##########
@@ -0,0 +1,491 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Dedicated implementation of `GroupsAccumulator` for `array_agg`
+
+use std::iter::repeat_n;
+use std::mem;
+use std::sync::Arc;
+
+use arrow::array::{new_empty_array, Array, GenericListArray};
+use arrow::array::{ArrayRef, AsArray, BooleanArray};
+use arrow::buffer::OffsetBuffer;
+use arrow::compute::kernels;
+use arrow::datatypes::{ArrowNativeType, Field};
+use datafusion_common::{internal_datafusion_err, Result};
+use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator};
+
+#[derive(Default, Clone)]
+pub struct AggGroupAccumulator {
+    // [1,2,3] [4,5,6]
+    stacked_batches: Vec<ArrayRef>,
+    // address items of each group within the stacked_batches
+    // this is maintained to perform kernel::interleave
+    stacked_group_indices: Vec<(
+        /*group_number*/ usize,
+        /*array_number*/ usize,
+        /*offset_in_array*/ usize,
+    )>,
+    indice_sorted: bool,
+    max_group: usize,
+}
+
+impl AggGroupAccumulator {
+    pub fn new() -> Self {
+        Self {
+            stacked_batches: vec![],
+            stacked_group_indices: vec![],
+            indice_sorted: false,
+            max_group: 0,
+        }
+    }
+    fn consume_stacked_batches(
+        &mut self,
+        emit_to: EmitTo,
+    ) -> Result<GenericListArray<i32>> {
+        // in the case of continous calls to function `evaluate` happen,
+        // (without any interleaving calls to `merge_batch` or `update_batch`)
+        // the first call will basically sort everything beforehand
+        // so the second one does not need to
+        if !self.indice_sorted {
+            self.indice_sorted = true;
+            self.stacked_group_indices.sort_by_key(|a| {
+                // TODO: array_agg with distinct and custom order can be 
implemented here
+                a.0
+            });
+        }
+
+        let mut current_group = self.stacked_group_indices[0].0;
+
+        // this is inclusive, zero-based
+        let stop_at_group = match emit_to {
+            EmitTo::All => self.max_group-1,
+            EmitTo::First(groups_taken) => groups_taken-1,
+        };
+        let mut group_windows =
+            Vec::<i32>::with_capacity(self.max_group.min(stop_at_group) + 1);
+        group_windows.push(0);
+        let mut split_offset = None;
+
+        // TODO: init with a good cap if possible via some stats during 
accumulation phase
+        let mut interleave_offsets = vec![];
+        for (offset, (group_index, array_number, offset_in_array)) in
+            self.stacked_group_indices.iter().enumerate()
+        {
+            if *group_index > stop_at_group {
+                split_offset = Some(offset);
+                break;
+            }
+            if *group_index > current_group {
+                current_group = *group_index;
+                group_windows.push(offset as i32);
+            }
+            interleave_offsets.push((*array_number, *offset_in_array));
+        }
+        if let Some(split_offset) = split_offset {
+            let mut tail_part = 
self.stacked_group_indices.split_off(split_offset);
+            mem::swap(&mut self.stacked_group_indices, &mut tail_part);
+            for item in self.stacked_group_indices.iter_mut() {
+                // shift down the number of group being taken
+                item.0 -= (stop_at_group+1)
+            }
+
+            group_windows.push(split_offset as i32);
+        } else {
+            group_windows.push(self.stacked_group_indices.len() as i32);
+            mem::take(&mut self.stacked_group_indices);
+        };
+
+        let stacked_batches = self
+            .stacked_batches
+            .iter()
+            .map(|a| a.as_ref())
+            .collect::<Vec<_>>();
+
+        let offsets_buffer = OffsetBuffer::new(group_windows.into());
+
+        // group indices like [1,1,1,2,2,2]
+        // backend_array like [a,b,c,d,e,f]
+        // offsets should be: [0,3,6]
+        // then result should be [a,b,c], [d,e,f]
+
+        // backend_array is a flatten list of individual values before 
aggregation
+        let backend_array =
+            kernels::interleave::interleave(&stacked_batches, 
&interleave_offsets)?;
+        let dt = backend_array.data_type();
+        let field = Arc::new(Field::new_list_field(dt.clone(), true));
+
+        let arr =
+            GenericListArray::<i32>::new(field, offsets_buffer, backend_array, 
None);
+        Ok(arr)
+    }
+}
+
+impl GroupsAccumulator for AggGroupAccumulator {
+    // given the stacked_batch as:
+    // - batch1 [1,4,5,6,7]
+    // - batch2 [5,1,1,1,1]
+
+    // and group_indices as
+    // indices g1: [(0,0), (1,1), (1,2) ...]
+    // indices g2: []
+    // indices g3: []
+    // indices g4: [(0,1)]
+    // each tuple represents (batch_index, and offset within the batch index)
+    // for example
+    // - (0,0) means the 0th item inside batch1, which is `1`
+    // - (1,1) means the 1th item inside batch2, which is `1`
+    fn update_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        let singular_col = values
+            .first()
+            .ok_or(internal_datafusion_err!("invalid agg input"))?;
+
+        self.stacked_batches.push(Arc::clone(singular_col));
+        let batch_index = self.stacked_batches.len() - 1;
+
+        if let Some(filter) = opt_filter {
+            for (array_offset, (group_index, filter_value)) in
+                group_indices.iter().zip(filter.iter()).enumerate()
+            {
+                if let Some(true) = filter_value {
+                    self.stacked_group_indices.push((
+                        *group_index,
+                        batch_index,
+                        array_offset,
+                    ));
+                }
+            }
+        } else {
+            for (array_offset, group_index) in 
group_indices.iter().enumerate() {
+                self.stacked_group_indices.push((
+                    *group_index,
+                    batch_index,
+                    array_offset,
+                ));
+            }
+        }
+        self.indice_sorted = false;
+        self.max_group = self.max_group.max(total_num_groups);
+        Ok(())
+    }
+
+    fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
+        let arr = self.consume_stacked_batches(emit_to)?;
+        Ok(Arc::new(arr) as ArrayRef)
+    }
+
+    // filtered_null_mask(opt_filter, &values);
+    fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
+        Ok(vec![self.evaluate(emit_to)?])
+    }
+
+    fn merge_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        // for merge_batch which happens at final stage
+        // opt_filter will always be none
+        _opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        let singular_col = values
+            .first()
+            .ok_or(internal_datafusion_err!("invalid agg input"))?;
+        let list_arr = singular_col.as_list::<i32>();
+        let new_array_number = self.stacked_batches.len();
+        // TODO: the backed_arr contains redundant data
+        // make sure that flatten_group_index has the same length with 
backed_arr
+        let flatten_group_index =
+            group_indices
+                .iter()
+                .enumerate()
+                .flat_map(|(row, group_index)| {
+                    let end = list_arr.value_offsets()[row + 1].as_usize();
+                    let start = list_arr.value_offsets()[row].as_usize();
+                    (start..end).map(|offset| (*group_index, new_array_number, 
offset))
+                });
+        self.stacked_group_indices.extend(flatten_group_index);
+
+        let backed_arr = list_arr.values();
+        self.stacked_batches.push(Arc::clone(backed_arr));
+        self.indice_sorted = false;
+        self.max_group = self.max_group.max(total_num_groups);
+        Ok(())
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self)

Review Comment:
   > some error on memory reservation — I'm not sure if the calculation in 
`fn size` is wrong, but as I understand it, we only account for the buffers 
newly created by the implementation, not the buffers we borrowed from 
elsewhere (i.e. the stacked `ArrayRef`s we receive each time in `fn 
merge_batch` or `fn update_batch`). Maybe it's a good chance for me to learn 
how memory reservation/spilling works
   
   I took a quick look at this code -- one thing we probably need to account 
for is the memory held by the stacked `ArrayRef`s -- specifically by calling 
https://docs.rs/arrow/latest/arrow/array/trait.Array.html#tymethod.get_array_memory_size
 on all the stacked arrays
   
   
   
   However, I bet the fuzz test failure is actually caused by the memory 
accounting now being more accurate. Maybe we need to readjust the test 
parameters or something
   
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to