alamb commented on code in PR #19625:
URL: https://github.com/apache/datafusion/pull/19625#discussion_r2672571468
##########
datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs:
##########
@@ -107,10 +165,16 @@ impl NullState {
T: ArrowPrimitiveType + Send,
F: FnMut(usize, T::Native) + Send,
{
- // ensure the seen_values is big enough (start everything at
- // "not seen" valid)
- let seen_values =
- initialize_builder(&mut self.seen_values, total_num_groups, false);
+ if let SeenValues::All { num_values } = &mut self.seen_values
Review Comment:
```suggestion
// skip null handling if no nulls in input or accumulator
if let SeenValues::All { num_values } = &mut self.seen_values
```
##########
datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs:
##########
@@ -211,21 +285,45 @@ impl NullState {
/// for the `emit_to` rows.
///
/// resets the internal state appropriately
- pub fn build(&mut self, emit_to: EmitTo) -> NullBuffer {
- let nulls: BooleanBuffer = self.seen_values.finish();
-
- let nulls = match emit_to {
- EmitTo::All => nulls,
- EmitTo::First(n) => {
- // split off the first N values in seen_values
- let first_n_null: BooleanBuffer = nulls.slice(0, n);
- // reset the existing seen buffer
- self.seen_values
- .append_buffer(&nulls.slice(n, nulls.len() - n));
- first_n_null
+ pub fn build(&mut self, emit_to: EmitTo) -> Option<NullBuffer> {
+ match emit_to {
+ EmitTo::All => {
+ let old_seen = std::mem::replace(
+ &mut self.seen_values,
+ SeenValues::All { num_values: 0 },
+ );
+ match old_seen {
+ SeenValues::All { .. } => None,
+ SeenValues::Some { mut values } => {
+ Some(NullBuffer::new(values.finish()))
+ }
+ }
}
- };
- NullBuffer::new(nulls)
+ EmitTo::First(n) => match &mut self.seen_values {
+ SeenValues::All { num_values } => {
+ *num_values = num_values.saturating_sub(n);
+ None
+ }
+ SeenValues::Some { .. } => {
+ let mut old_values = match std::mem::replace(
+ &mut self.seen_values,
+ SeenValues::All { num_values: 0 },
Review Comment:
It might be nicer if you implemented SeenValues::new() or maybe even
SeenValues::default
SO this couls look like
```rust
let mut old_values = match std::mem::take(&mut self.seen_values);
```
##########
datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/prim_op.rs:
##########
@@ -106,7 +106,8 @@ where
opt_filter,
total_num_groups,
|group_index, new_value| {
- let value = &mut self.values[group_index];
+ // SAFETY: group_index is guaranteed to be in bounds
Review Comment:
I recommend adding safety notes to the docs of GroupsAccumulator in
https://github.com/apache/datafusion/blob/36ec9f1de0aeabca60b8f7ebe07d650b8ef03506/datafusion/expr-common/src/groups_accumulator.rs#L114-L113
That explains that all group indexes are guaranteed to be <=
`total_num_groups` and that can be relied on for safety
##########
datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs:
##########
@@ -107,10 +165,16 @@ impl NullState {
T: ArrowPrimitiveType + Send,
F: FnMut(usize, T::Native) + Send,
{
- // ensure the seen_values is big enough (start everything at
- // "not seen" valid)
- let seen_values =
- initialize_builder(&mut self.seen_values, total_num_groups, false);
+ if let SeenValues::All { num_values } = &mut self.seen_values
Review Comment:
One possibility is to make this another another function to reduce some
duplication and have a place it could be explained
Maybe something like
```rust
if let Some(num_values) = self.all_values_mut() && opt_filter.is_none &&
values.null_count == 0 {
...
}
```
?
Though maybe an extra level of indirection would make it harder to follow.
##########
datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs:
##########
@@ -140,10 +204,20 @@ impl NullState {
let data = values.values();
assert_eq!(data.len(), group_indices.len());
- // ensure the seen_values is big enough (start everything at
- // "not seen" valid)
- let seen_values =
- initialize_builder(&mut self.seen_values, total_num_groups, false);
+ if let SeenValues::All { num_values } = &mut self.seen_values
Review Comment:
```suggestion
// skip null handling if no nulls in input or accumulator
if let SeenValues::All { num_values } = &mut self.seen_values
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]