This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 0ded0ce1be Account for child `Bucket` size in OrderPreservingInterner
(#4646)
0ded0ce1be is described below
commit 0ded0ce1be928ac8a74ce8e791febda95e01c05e
Author: Andrew Lamb <[email protected]>
AuthorDate: Tue Aug 8 15:35:19 2023 -0500
Account for child `Bucket` size in OrderPreservingInterner (#4646)
* Account for child buckets in OrderPreservingInterner
* Add a test
* Tweak
* clipy
---
arrow-row/src/interner.rs | 93 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 93 insertions(+)
diff --git a/arrow-row/src/interner.rs b/arrow-row/src/interner.rs
index 1c71b6a552..fde9251952 100644
--- a/arrow-row/src/interner.rs
+++ b/arrow-row/src/interner.rs
@@ -343,8 +343,19 @@ impl Bucket {
fn size(&self) -> usize {
std::mem::size_of::<Self>()
+ self.slots.capacity() * std::mem::size_of::<Slot>()
+ // and account for the size of any embedded buckets in the slots
+ + self.slot_child_bucket_size()
+ self.next.as_ref().map(|x| x.size()).unwrap_or_default()
}
+
+ /// returns the total size of any recursively allocated `Bucket`s
+ /// in self.slots. This does not include the size of the child Slot itself
+ fn slot_child_bucket_size(&self) -> usize {
+ self.slots
+ .iter()
+ .map(|slot| slot.child.as_ref().map(|x|
x.size()).unwrap_or_default())
+ .sum()
+ }
}
#[cfg(test)]
@@ -427,4 +438,86 @@ mod tests {
interner.normalized_key(interned[3]) <
interner.normalized_key(interned[2])
);
}
+
+ #[test]
+ fn test_intern_sizes() {
+ let mut interner = OrderPreservingInterner::default();
+
+ // Intern a 1K values each 8 bytes large
+ let num_items = 1000;
+ let mut values: Vec<usize> = (0..num_items).collect();
+ values.reverse();
+
+ // intern these values 1 at a time (otherwise the interner
+ // will sort them first);
+ for v in values {
+ interner.intern([Some(v.to_be_bytes())]);
+ }
+
+ let reported = interner.size();
+
+ // Figure out the expected size (this is a second
+ // implementation of size()) as a double check
+ let min_expected = BucketWalker::new()
+ .visit_bucket(interner.bucket.as_ref())
+ .memory_estimate()
+ // hash table size
+ + interner.lookup.capacity() * std::mem::size_of::<Interned>()
+ // key/value storage
+ + interner.keys.buffer_size()
+ + interner.values.buffer_size();
+
+ assert!(
+ reported > min_expected,
+ "reported size {reported} not larger than min expected size:
{min_expected}"
+ )
+ }
+
+ // Walks over the buckets / slots counting counting them all
+ struct BucketWalker {
+ num_buckets: usize,
+ num_slots: usize,
+ }
+
+ impl BucketWalker {
+ fn new() -> Self {
+ Self {
+ num_buckets: 0,
+ num_slots: 0,
+ }
+ }
+
+ // recursively visit the bucket and any slots/buckets contained
+ fn visit_bucket(mut self, bucket: &Bucket) -> Self {
+ self.num_buckets += 1;
+ let acc = bucket
+ .slots
+ .iter()
+ .fold(self, |acc, slot| acc.visit_slot(slot));
+
+ if let Some(next) = bucket.next.as_ref() {
+ acc.visit_bucket(next.as_ref())
+ } else {
+ acc
+ }
+ }
+
+ // recursively visit slot and any slots/buckets
+ fn visit_slot(mut self, slot: &Slot) -> Self {
+ self.num_slots += 1;
+ if let Some(child) = slot.child.as_ref() {
+ self.visit_bucket(child.as_ref())
+ } else {
+ self
+ }
+ }
+
+ // estimate how much memory is used just for Buckets / Slots
+ // (an underestimate of the total memory used for the
+ // interner as it doesn't contain any actual values)
+ fn memory_estimate(self) -> usize {
+ self.num_buckets * std::mem::size_of::<Bucket>()
+ + self.num_slots * std::mem::size_of::<Slot>()
+ }
+ }
}