alamb commented on code in PR #15924:
URL: https://github.com/apache/datafusion/pull/15924#discussion_r2084924225
##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -3435,49 +3435,80 @@ impl ScalarValue {
.sum::<usize>()
}
- /// Performs a deep clone of the ScalarValue, creating new copies of all nested data structures.
- /// This is different from the standard `clone()` which may share data through `Arc`.
- /// Aggregation functions like `max` will cost a lot of memory if the data is not cloned.
- pub fn force_clone(&self) -> Self {
+ /// Compacts the allocation referenced by `self` to the minimum, copying the data if
Review Comment:
👍
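For context on what "copying the data" can involve: a minimal sketch, not this PR's implementation, of deep-copying an array into freshly allocated, minimally sized buffers with arrow's `MutableArrayData` (the `deep_copy` helper name is hypothetical):
```rust
use arrow::array::{make_array, Array, ArrayRef, MutableArrayData};

// Hypothetical helper: copy an array's logical contents into freshly
// allocated, minimally sized buffers. A sliced array keeps the parent
// buffers alive; copying through MutableArrayData drops that reference.
fn deep_copy(array: &ArrayRef) -> ArrayRef {
    let data = array.to_data();
    let mut mutable = MutableArrayData::new(vec![&data], false, array.len());
    mutable.extend(0, 0, array.len());
    make_array(mutable.freeze())
}
```
An explicit copy like this is what lets an accumulator hold a single row without pinning the entire input batch.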
##########
datafusion/functions-aggregate/src/min_max.rs:
##########
@@ -645,19 +645,29 @@ fn min_max_batch_struct(array: &ArrayRef, ordering: Ordering) -> Result<ScalarVa
}
}
}
- // use force_clone to free array reference
- Ok(extreme.force_clone())
+
+ Ok(extreme)
}
macro_rules! min_max_struct {
($VALUE:expr, $DELTA:expr, $OP:ident) => {{
if $VALUE.is_null() {
- $DELTA.clone()
+ let mut delta_copy = $DELTA.clone();
+ // When the new value wins, compact it to
+ // avoid retaining the entire input
+ delta_copy.compact();
Review Comment:
it would be great to try and avoid the duplication in these min/max macros anyway (and instead use an Array) -- no changes are needed in this PR, I am just observing
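As a sketch of that observation (a hypothetical `keep_extreme` helper, not code in this PR), both macro arms could funnel through one function:
```rust
use std::cmp::Ordering;
use datafusion_common::ScalarValue;

// Hypothetical helper: both macro arms reduce to "replace the current
// extreme when the new value wins, and compact the winner so it does
// not pin the input batch".
fn keep_extreme(current: &mut ScalarValue, delta: &ScalarValue, ordering: Ordering) {
    let delta_wins = current.is_null()
        || (!delta.is_null() && delta.partial_cmp(current) == Some(ordering));
    if delta_wins {
        let mut winner = delta.clone();
        winner.compact(); // new in this PR: copy out of the input allocation
        *current = winner;
    }
}
```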
##########
datafusion/functions-aggregate/src/first_last.rs:
##########
@@ -1226,9 +1232,13 @@ impl LastValueAccumulator {
}
// Updates state with the values in the given row.
- fn update_with_new_row(&mut self, row: &[ScalarValue]) {
- self.last = row[0].clone();
- self.orderings = row[1..].to_vec();
+ fn update_with_new_row(&mut self, mut row: Vec<ScalarValue>) {
+ row.iter_mut().for_each(|s| {
Review Comment:
```suggestion
// Ensure any Array based scalars hold a single value to reduce memory pressure
row.iter_mut().for_each(|s| {
```
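For readers skimming the diff, a sketch of how the full method might read with this suggestion applied (assumptions noted in the comments):
```rust
// Sketch only: assumes the elided closure body simply calls compact(),
// and reuses the `last`, `orderings`, and `is_set` fields from the
// surrounding code.
fn update_with_new_row(&mut self, mut row: Vec<ScalarValue>) {
    // Ensure any Array based scalars hold a single value to reduce memory pressure
    row.iter_mut().for_each(|s| s.compact());
    self.last = row.remove(0);
    self.orderings = row;
    self.is_set = true;
}
```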
##########
datafusion/functions-aggregate/src/first_last.rs:
##########
@@ -1772,4 +1790,60 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_first_list_acc_size() -> Result<()> {
+ fn size_after_batch(values: &[ArrayRef]) -> Result<usize> {
+ let mut first_accumulator = FirstValueAccumulator::try_new(
+ &DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
+ &[],
+ LexOrdering::default(),
+ false,
+ )?;
+
+ first_accumulator.update_batch(values)?;
+
+ Ok(first_accumulator.size())
+ }
+
+ let batch1 = ListArray::from_iter_primitive::<Int32Type, _, _>(
+ repeat_with(|| Some(vec![Some(1)])).take(10000),
+ );
+ let batch2 =
+ ListArray::from_iter_primitive::<Int32Type, _, _>([Some(vec![Some(1)])]);
+
+ let size1 = size_after_batch(&[Arc::new(batch1)])?;
+ let size2 = size_after_batch(&[Arc::new(batch2)])?;
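+ // If compaction works, the retained scalar's size is independent of
+ // the number of rows in the input batch: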
+ assert_eq!(size1, size2);
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_last_list_acc_size() -> Result<()> {
+ fn size_after_batch(values: &[ArrayRef]) -> Result<usize> {
+ let mut last_accumulator = FirstValueAccumulator::try_new(
Review Comment:
I think this is the wrong accumulator:
```suggestion
let mut last_accumulator = LastValueAccumulator::try_new(
```
##########
datafusion/functions-aggregate/src/first_last.rs:
##########
@@ -827,9 +827,13 @@ impl FirstValueAccumulator {
}
// Updates state with the values in the given row.
- fn update_with_new_row(&mut self, row: &[ScalarValue]) {
- self.first = row[0].clone();
- self.orderings = row[1..].to_vec();
+ fn update_with_new_row(&mut self, mut row: Vec<ScalarValue>) {
+ row.iter_mut().for_each(|s| {
Review Comment:
```suggestion
// Ensure any Array based scalars hold a single value to reduce memory pressure
row.iter_mut().for_each(|s| {
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]