Samyak2 commented on code in PR #22862: URL: https://github.com/apache/datafusion/pull/22862#discussion_r3385300135
########## datafusion/common/src/utils/memory.rs: ########## @@ -131,34 +131,75 @@ pub fn estimate_memory_size<T>(num_elements: usize, fixed_size: usize) -> Result /// `Buffer`. This method provides temporary fix until the issue is resolved: /// <https://github.com/apache/arrow-rs/issues/6439> pub fn get_record_batch_memory_size(batch: &RecordBatch) -> usize { - // Store pointers to `Buffer`'s start memory address (instead of actual - // used data region's pointer represented by current `Array`) - let mut counted_buffers: HashSet<NonNull<u8>> = HashSet::new(); - let mut total_size = 0; - - for array in batch.columns() { - let array_data = array.to_data(); - count_array_data_memory_size(&array_data, &mut counted_buffers, &mut total_size); + RecordBatchMemoryCounter::new().count_batch(batch) +} + +/// Tracks the memory used by a sequence of [`RecordBatch`]es that may share +/// underlying buffers, counting each buffer exactly once. +/// +/// Use this instead of [`get_record_batch_memory_size`] to account for the +/// total memory of a sequence of batches, e.g. when buffering the batches of +/// an input stream. Such batches can share buffers (for example, operators +/// like aggregates emit one large batch as multiple zero-copy slices), and +/// calling [`get_record_batch_memory_size`] per batch counts the shared +/// buffers once per batch, while this counter counts them exactly once. A +/// batch's buffers are kept alive by the batch even when only a sub-range is +/// referenced, so counting unique buffers in full reflects the memory the +/// batches actually retain. +#[derive(Debug, Default)] +pub struct RecordBatchMemoryCounter { + /// Start addresses of `Buffer`s that have already been counted (instead of + /// actual used data region's pointer represented by current `Array`) + counted_buffers: HashSet<usize>, Review Comment: We could use `NonZero<usize>` instead and then use `as_ptr().addr()` below. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
