This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch 53.0.0_maintenance
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/53.0.0_maintenance by this
push:
new 58f086fc28 Add `Array::shrink_to_fit(&mut self)` to 53.4.0 (#6790)
(#6817) (#6962)
58f086fc28 is described below
commit 58f086fc2859b7e50cb11171c2c5d7549ba16aec
Author: Emil Ernerfeldt <[email protected]>
AuthorDate: Mon Jan 13 18:35:30 2025 +0100
Add `Array::shrink_to_fit(&mut self)` to 53.4.0 (#6790) (#6817) (#6962)
* Add `Array::shrink_to_fit(&mut self)` (#6790)
* Add `Array::shrink_to_fit`
* Test that shrink_to_fit actually frees memory
* Make sure the buffer isn't shared in the test of shrink_to_fit
* Remove `#[inline]`
* Use `realloc` to reallocate the bytes
* Clean up test
* Improve docstring for `Array::shrink_to_fit`
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
* `Buffer::shrink_to_fit`: ignore shared buffers
* Improve comment in `ArrayRef::shrink_to_fit`
* Document why `try_realloc` is safe, and actually make it safe :)
* Improve testing of shrink_to_fit
* Fix a few corner cases, and improve test
* Add license header to new test file
---------
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
* Support shrink to empty (#6817)
* Support shrink to empty
* Docs
* Format
---------
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
---
arrow-array/src/array/boolean_array.rs | 7 +
arrow-array/src/array/byte_array.rs | 8 ++
arrow-array/src/array/byte_view_array.rs | 43 +++---
arrow-array/src/array/dictionary_array.rs | 5 +
arrow-array/src/array/fixed_size_binary_array.rs | 7 +
arrow-array/src/array/fixed_size_list_array.rs | 7 +
arrow-array/src/array/list_array.rs | 8 ++
arrow-array/src/array/list_view_array.rs | 9 ++
arrow-array/src/array/map_array.rs | 8 ++
arrow-array/src/array/mod.rs | 15 +++
arrow-array/src/array/primitive_array.rs | 7 +
arrow-array/src/array/run_array.rs | 5 +
arrow-array/src/array/struct_array.rs | 7 +
arrow-array/src/array/union_array.rs | 11 ++
arrow-buffer/src/buffer/boolean.rs | 6 +
arrow-buffer/src/buffer/immutable.rs | 63 +++++++++
arrow-buffer/src/buffer/mutable.rs | 9 +-
arrow-buffer/src/buffer/null.rs | 5 +
arrow-buffer/src/buffer/offset.rs | 5 +
arrow-buffer/src/buffer/run.rs | 6 +
arrow-buffer/src/buffer/scalar.rs | 5 +
arrow-buffer/src/bytes.rs | 43 ++++++
arrow/tests/shrink_to_fit.rs | 159 +++++++++++++++++++++++
23 files changed, 428 insertions(+), 20 deletions(-)
diff --git a/arrow-array/src/array/boolean_array.rs
b/arrow-array/src/array/boolean_array.rs
index 0f95adacf1..9c2d4af8c4 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -308,6 +308,13 @@ impl Array for BooleanArray {
self.values.is_empty()
}
+ fn shrink_to_fit(&mut self) {
+ self.values.shrink_to_fit();
+ if let Some(nulls) = &mut self.nulls {
+ nulls.shrink_to_fit();
+ }
+ }
+
fn offset(&self) -> usize {
self.values.offset()
}
diff --git a/arrow-array/src/array/byte_array.rs
b/arrow-array/src/array/byte_array.rs
index bec0caab10..f2b2250708 100644
--- a/arrow-array/src/array/byte_array.rs
+++ b/arrow-array/src/array/byte_array.rs
@@ -453,6 +453,14 @@ impl<T: ByteArrayType> Array for GenericByteArray<T> {
self.value_offsets.len() <= 1
}
+ fn shrink_to_fit(&mut self) {
+ self.value_offsets.shrink_to_fit();
+ self.value_data.shrink_to_fit();
+ if let Some(nulls) = &mut self.nulls {
+ nulls.shrink_to_fit();
+ }
+ }
+
fn offset(&self) -> usize {
0
}
diff --git a/arrow-array/src/array/byte_view_array.rs
b/arrow-array/src/array/byte_view_array.rs
index 81bb6a3855..9d2d396a52 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -430,31 +430,31 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
///
/// Before GC:
/// ```text
- /// ┌──────┐
- /// │......│
- /// │......│
- /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
+ /// ┌──────┐
+ /// │......│
+ /// │......│
+ /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
/// │ View 1 │─ ─ ─ ─ │......│ with data that
/// ├────────────────────┤ │......│ is not referred
/// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
- /// └────────────────────┘ │......│ View 2
- /// │......│
- /// 2 views, refer to │......│
- /// small portions of a └──────┘
- /// large buffer
+ /// └────────────────────┘ │......│ View 2
+ /// │......│
+ /// 2 views, refer to │......│
+ /// small portions of a └──────┘
+ /// large buffer
/// ```
- ///
+ ///
/// After GC:
///
/// ```text
/// ┌────────────────────┐ ┌─────┐ After gc, only
- /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
- /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
- /// │ View 2 │─ ─ ─ ─ └─────┘ the views is
- /// └────────────────────┘ left
- ///
- ///
- /// 2 views
+ /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
+ /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
+ /// │ View 2 │─ ─ ─ ─ └─────┘ the views is
+ /// └────────────────────┘ left
+ ///
+ ///
+ /// 2 views
/// ```
/// This method will compact the data buffers by recreating the view array
and only include the data
/// that is pointed to by the views.
@@ -575,6 +575,15 @@ impl<T: ByteViewType + ?Sized> Array for
GenericByteViewArray<T> {
self.views.is_empty()
}
+ fn shrink_to_fit(&mut self) {
+ self.views.shrink_to_fit();
+ self.buffers.iter_mut().for_each(|b| b.shrink_to_fit());
+ self.buffers.shrink_to_fit();
+ if let Some(nulls) = &mut self.nulls {
+ nulls.shrink_to_fit();
+ }
+ }
+
fn offset(&self) -> usize {
0
}
diff --git a/arrow-array/src/array/dictionary_array.rs
b/arrow-array/src/array/dictionary_array.rs
index 1187e16769..988bdbc7c9 100644
--- a/arrow-array/src/array/dictionary_array.rs
+++ b/arrow-array/src/array/dictionary_array.rs
@@ -720,6 +720,11 @@ impl<T: ArrowDictionaryKeyType> Array for
DictionaryArray<T> {
self.keys.is_empty()
}
+ fn shrink_to_fit(&mut self) {
+ self.keys.shrink_to_fit();
+ self.values.shrink_to_fit();
+ }
+
fn offset(&self) -> usize {
self.keys.offset()
}
diff --git a/arrow-array/src/array/fixed_size_binary_array.rs
b/arrow-array/src/array/fixed_size_binary_array.rs
index 8f1489ee4c..25fe2e3dfe 100644
--- a/arrow-array/src/array/fixed_size_binary_array.rs
+++ b/arrow-array/src/array/fixed_size_binary_array.rs
@@ -602,6 +602,13 @@ impl Array for FixedSizeBinaryArray {
self.len == 0
}
+ fn shrink_to_fit(&mut self) {
+ self.value_data.shrink_to_fit();
+ if let Some(nulls) = &mut self.nulls {
+ nulls.shrink_to_fit();
+ }
+ }
+
fn offset(&self) -> usize {
0
}
diff --git a/arrow-array/src/array/fixed_size_list_array.rs
b/arrow-array/src/array/fixed_size_list_array.rs
index 00a3144a87..2e784343b0 100644
--- a/arrow-array/src/array/fixed_size_list_array.rs
+++ b/arrow-array/src/array/fixed_size_list_array.rs
@@ -401,6 +401,13 @@ impl Array for FixedSizeListArray {
self.len == 0
}
+ fn shrink_to_fit(&mut self) {
+ self.values.shrink_to_fit();
+ if let Some(nulls) = &mut self.nulls {
+ nulls.shrink_to_fit();
+ }
+ }
+
fn offset(&self) -> usize {
0
}
diff --git a/arrow-array/src/array/list_array.rs
b/arrow-array/src/array/list_array.rs
index 1fab0009f2..ee02b88772 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -485,6 +485,14 @@ impl<OffsetSize: OffsetSizeTrait> Array for
GenericListArray<OffsetSize> {
self.value_offsets.len() <= 1
}
+ fn shrink_to_fit(&mut self) {
+ if let Some(nulls) = &mut self.nulls {
+ nulls.shrink_to_fit();
+ }
+ self.values.shrink_to_fit();
+ self.value_offsets.shrink_to_fit();
+ }
+
fn offset(&self) -> usize {
0
}
diff --git a/arrow-array/src/array/list_view_array.rs
b/arrow-array/src/array/list_view_array.rs
index 4e949a6427..c59b178f7b 100644
--- a/arrow-array/src/array/list_view_array.rs
+++ b/arrow-array/src/array/list_view_array.rs
@@ -326,6 +326,15 @@ impl<OffsetSize: OffsetSizeTrait> Array for
GenericListViewArray<OffsetSize> {
self.value_sizes.is_empty()
}
+ fn shrink_to_fit(&mut self) {
+ if let Some(nulls) = &mut self.nulls {
+ nulls.shrink_to_fit();
+ }
+ self.values.shrink_to_fit();
+ self.value_offsets.shrink_to_fit();
+ self.value_sizes.shrink_to_fit();
+ }
+
fn offset(&self) -> usize {
0
}
diff --git a/arrow-array/src/array/map_array.rs
b/arrow-array/src/array/map_array.rs
index 254437630a..18a7c491aa 100644
--- a/arrow-array/src/array/map_array.rs
+++ b/arrow-array/src/array/map_array.rs
@@ -372,6 +372,14 @@ impl Array for MapArray {
self.value_offsets.len() <= 1
}
+ fn shrink_to_fit(&mut self) {
+ if let Some(nulls) = &mut self.nulls {
+ nulls.shrink_to_fit();
+ }
+ self.entries.shrink_to_fit();
+ self.value_offsets.shrink_to_fit();
+ }
+
fn offset(&self) -> usize {
0
}
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index 04d9883f5b..456b3acac0 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -167,6 +167,12 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// ```
fn is_empty(&self) -> bool;
+ /// Shrinks the capacity of any exclusively owned buffer as much as
possible
+ ///
+ /// Shared or externally allocated buffers will be ignored, and
+ /// any buffer offsets will be preserved.
+ fn shrink_to_fit(&mut self) {}
+
/// Returns the offset into the underlying data used by this array(-slice).
/// Note that the underlying data can be shared by many arrays.
/// This defaults to `0`.
@@ -366,6 +372,15 @@ impl Array for ArrayRef {
self.as_ref().is_empty()
}
+ /// For shared buffers, this is a no-op.
+ fn shrink_to_fit(&mut self) {
+ if let Some(slf) = Arc::get_mut(self) {
+ slf.shrink_to_fit();
+ } else {
+ // We ignore shared buffers.
+ }
+ }
+
fn offset(&self) -> usize {
self.as_ref().offset()
}
diff --git a/arrow-array/src/array/primitive_array.rs
b/arrow-array/src/array/primitive_array.rs
index 7b0d6c5ca1..bb7413bb85 100644
--- a/arrow-array/src/array/primitive_array.rs
+++ b/arrow-array/src/array/primitive_array.rs
@@ -1152,6 +1152,13 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
self.values.is_empty()
}
+ fn shrink_to_fit(&mut self) {
+ self.values.shrink_to_fit();
+ if let Some(nulls) = &mut self.nulls {
+ nulls.shrink_to_fit();
+ }
+ }
+
fn offset(&self) -> usize {
0
}
diff --git a/arrow-array/src/array/run_array.rs
b/arrow-array/src/array/run_array.rs
index dc4e6c96d9..b340bf9a90 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -330,6 +330,11 @@ impl<T: RunEndIndexType> Array for RunArray<T> {
self.run_ends.is_empty()
}
+ fn shrink_to_fit(&mut self) {
+ self.run_ends.shrink_to_fit();
+ self.values.shrink_to_fit();
+ }
+
fn offset(&self) -> usize {
self.run_ends.offset()
}
diff --git a/arrow-array/src/array/struct_array.rs
b/arrow-array/src/array/struct_array.rs
index 41eb8235e5..16deba1063 100644
--- a/arrow-array/src/array/struct_array.rs
+++ b/arrow-array/src/array/struct_array.rs
@@ -370,6 +370,13 @@ impl Array for StructArray {
self.len == 0
}
+ fn shrink_to_fit(&mut self) {
+ if let Some(nulls) = &mut self.nulls {
+ nulls.shrink_to_fit();
+ }
+ self.fields.iter_mut().for_each(|n| n.shrink_to_fit());
+ }
+
fn offset(&self) -> usize {
0
}
diff --git a/arrow-array/src/array/union_array.rs
b/arrow-array/src/array/union_array.rs
index 3c6da5a7b5..43019c659f 100644
--- a/arrow-array/src/array/union_array.rs
+++ b/arrow-array/src/array/union_array.rs
@@ -744,6 +744,17 @@ impl Array for UnionArray {
self.type_ids.is_empty()
}
+ fn shrink_to_fit(&mut self) {
+ self.type_ids.shrink_to_fit();
+ if let Some(offsets) = &mut self.offsets {
+ offsets.shrink_to_fit();
+ }
+ for array in self.fields.iter_mut().flatten() {
+ array.shrink_to_fit();
+ }
+ self.fields.shrink_to_fit();
+ }
+
fn offset(&self) -> usize {
0
}
diff --git a/arrow-buffer/src/buffer/boolean.rs
b/arrow-buffer/src/buffer/boolean.rs
index 49a75b468d..e2c892315b 100644
--- a/arrow-buffer/src/buffer/boolean.rs
+++ b/arrow-buffer/src/buffer/boolean.rs
@@ -125,6 +125,12 @@ impl BooleanBuffer {
self.len == 0
}
+ /// Free up unused memory.
+ pub fn shrink_to_fit(&mut self) {
+ // TODO(emilk): we could shrink even more in the case where we are a
small sub-slice of the full buffer
+ self.buffer.shrink_to_fit();
+ }
+
/// Returns the boolean value at index `i`.
///
/// # Panics
diff --git a/arrow-buffer/src/buffer/immutable.rs
b/arrow-buffer/src/buffer/immutable.rs
index 8d1a46583f..820ad04bf6 100644
--- a/arrow-buffer/src/buffer/immutable.rs
+++ b/arrow-buffer/src/buffer/immutable.rs
@@ -167,6 +167,41 @@ impl Buffer {
self.data.capacity()
}
+ /// Tries to shrink the capacity of the buffer as much as possible,
freeing unused memory.
+ ///
+ /// If the buffer is shared, this is a no-op.
+ ///
+ /// If the memory was allocated with a custom allocator, this is a no-op.
+ ///
+ /// If the capacity is already less than or equal to the desired capacity,
this is a no-op.
+ ///
+ /// The memory region will be reallocated using `std::alloc::realloc`.
+ pub fn shrink_to_fit(&mut self) {
+ let offset = self.ptr_offset();
+ let is_empty = self.is_empty();
+ let desired_capacity = if is_empty {
+ 0
+ } else {
+ // For realloc to work, we cannot free the elements before the
offset
+ offset + self.len()
+ };
+ if desired_capacity < self.capacity() {
+ if let Some(bytes) = Arc::get_mut(&mut self.data) {
+ if bytes.try_realloc(desired_capacity).is_ok() {
+ // Realloc complete - update our pointer into `bytes`:
+ self.ptr = if is_empty {
+ bytes.as_ptr()
+ } else {
+ // SAFETY: we kept all elements leading up to the
offset
+ unsafe { bytes.as_ptr().add(offset) }
+ }
+ } else {
+ // Failure to reallocate is fine; we just failed to free
up memory.
+ }
+ }
+ }
+ }
+
/// Returns whether the buffer is empty.
#[inline]
pub fn is_empty(&self) -> bool {
@@ -562,6 +597,34 @@ mod tests {
assert_eq!(buf2.slice_with_length(2, 1).as_slice(), &[10]);
}
+ #[test]
+ fn test_shrink_to_fit() {
+ let original = Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7]);
+ assert_eq!(original.as_slice(), &[0, 1, 2, 3, 4, 5, 6, 7]);
+ assert_eq!(original.capacity(), 64);
+
+ let slice = original.slice_with_length(2, 3);
+ drop(original); // Make sure the buffer isn't shared (or shrink_to_fit
won't work)
+ assert_eq!(slice.as_slice(), &[2, 3, 4]);
+ assert_eq!(slice.capacity(), 64);
+
+ let mut shrunk = slice;
+ shrunk.shrink_to_fit();
+ assert_eq!(shrunk.as_slice(), &[2, 3, 4]);
+ assert_eq!(shrunk.capacity(), 5); // shrink_to_fit is allowed to keep
the elements before the offset
+
+ // Test that we can handle empty slices:
+ let empty_slice = shrunk.slice_with_length(1, 0);
+ drop(shrunk); // Make sure the buffer isn't shared (or shrink_to_fit
won't work)
+ assert_eq!(empty_slice.as_slice(), &[]);
+ assert_eq!(empty_slice.capacity(), 5);
+
+ let mut shrunk_empty = empty_slice;
+ shrunk_empty.shrink_to_fit();
+ assert_eq!(shrunk_empty.as_slice(), &[]);
+ assert_eq!(shrunk_empty.capacity(), 0);
+ }
+
#[test]
#[should_panic(expected = "the offset of the new Buffer cannot exceed the
existing length")]
fn test_slice_offset_out_of_bound() {
diff --git a/arrow-buffer/src/buffer/mutable.rs
b/arrow-buffer/src/buffer/mutable.rs
index 7fcbd89dd2..285c7b10ef 100644
--- a/arrow-buffer/src/buffer/mutable.rs
+++ b/arrow-buffer/src/buffer/mutable.rs
@@ -483,10 +483,13 @@ impl MutableBuffer {
}
}
+/// Creates a non-null pointer with alignment of [`ALIGNMENT`]
+///
+/// This is similar to [`NonNull::dangling`]
#[inline]
-fn dangling_ptr() -> NonNull<u8> {
- // SAFETY: ALIGNMENT is a non-zero usize which is then casted
- // to a *mut T. Therefore, `ptr` is not null and the conditions for
+pub(crate) fn dangling_ptr() -> NonNull<u8> {
+ // SAFETY: ALIGNMENT is a non-zero usize which is then cast
+ // to a *mut u8. Therefore, `ptr` is not null and the conditions for
// calling new_unchecked() are respected.
#[cfg(miri)]
{
diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs
index c79aef3980..137d900ac8 100644
--- a/arrow-buffer/src/buffer/null.rs
+++ b/arrow-buffer/src/buffer/null.rs
@@ -130,6 +130,11 @@ impl NullBuffer {
self.buffer.is_empty()
}
+ /// Free up unused memory.
+ pub fn shrink_to_fit(&mut self) {
+ self.buffer.shrink_to_fit();
+ }
+
/// Returns the null count for this [`NullBuffer`]
#[inline]
pub fn null_count(&self) -> usize {
diff --git a/arrow-buffer/src/buffer/offset.rs
b/arrow-buffer/src/buffer/offset.rs
index e9087d3009..a6be2b67af 100644
--- a/arrow-buffer/src/buffer/offset.rs
+++ b/arrow-buffer/src/buffer/offset.rs
@@ -133,6 +133,11 @@ impl<O: ArrowNativeType> OffsetBuffer<O> {
Self(out.into())
}
+ /// Free up unused memory.
+ pub fn shrink_to_fit(&mut self) {
+ self.0.shrink_to_fit();
+ }
+
/// Returns the inner [`ScalarBuffer`]
pub fn inner(&self) -> &ScalarBuffer<O> {
&self.0
diff --git a/arrow-buffer/src/buffer/run.rs b/arrow-buffer/src/buffer/run.rs
index 3dbbe344a0..cc6d19044f 100644
--- a/arrow-buffer/src/buffer/run.rs
+++ b/arrow-buffer/src/buffer/run.rs
@@ -136,6 +136,12 @@ where
self.len == 0
}
+ /// Free up unused memory.
+ pub fn shrink_to_fit(&mut self) {
+ // TODO(emilk): we could shrink even more in the case where we are a
small sub-slice of the full buffer
+ self.run_ends.shrink_to_fit();
+ }
+
/// Returns the values of this [`RunEndBuffer`] not including any offset
#[inline]
pub fn values(&self) -> &[E] {
diff --git a/arrow-buffer/src/buffer/scalar.rs
b/arrow-buffer/src/buffer/scalar.rs
index 343b8549e9..ab6c87168e 100644
--- a/arrow-buffer/src/buffer/scalar.rs
+++ b/arrow-buffer/src/buffer/scalar.rs
@@ -72,6 +72,11 @@ impl<T: ArrowNativeType> ScalarBuffer<T> {
buffer.slice_with_length(byte_offset, byte_len).into()
}
+ /// Free up unused memory.
+ pub fn shrink_to_fit(&mut self) {
+ self.buffer.shrink_to_fit();
+ }
+
/// Returns a zero-copy slice of this buffer with length `len` and
starting at `offset`
pub fn slice(&self, offset: usize, len: usize) -> Self {
Self::new(self.buffer.clone(), offset, len)
diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs
index ba61342d8e..77724137ae 100644
--- a/arrow-buffer/src/bytes.rs
+++ b/arrow-buffer/src/bytes.rs
@@ -24,6 +24,7 @@ use std::ptr::NonNull;
use std::{fmt::Debug, fmt::Formatter};
use crate::alloc::Deallocation;
+use crate::buffer::dangling_ptr;
/// A continuous, fixed-size, immutable memory region that knows how to
de-allocate itself.
///
@@ -96,6 +97,48 @@ impl Bytes {
}
}
+ /// Try to reallocate the underlying memory region to a new size (smaller
or larger).
+ ///
+ /// Only works for bytes allocated with the standard allocator.
+ /// Returns `Err` if the memory was allocated with a custom allocator,
+ /// or the call to `realloc` failed, for whatever reason.
+ /// In case of `Err`, the [`Bytes`] will remain as it was (i.e. have the
old size).
+ pub fn try_realloc(&mut self, new_len: usize) -> Result<(), ()> {
+ if let Deallocation::Standard(old_layout) = self.deallocation {
+ if old_layout.size() == new_len {
+ return Ok(()); // Nothing to do
+ }
+
+ if let Ok(new_layout) =
std::alloc::Layout::from_size_align(new_len, old_layout.align())
+ {
+ let old_ptr = self.ptr.as_ptr();
+
+ let new_ptr = match new_layout.size() {
+ 0 => {
+ // SAFETY: Verified that old_layout.size != new_len (0)
+ unsafe { std::alloc::dealloc(self.ptr.as_ptr(),
old_layout) };
+ Some(dangling_ptr())
+ }
+ // SAFETY: the call to `realloc` is safe if all the
following hold (from
https://doc.rust-lang.org/stable/std/alloc/trait.GlobalAlloc.html#method.realloc):
+ // * `old_ptr` must be currently allocated via this
allocator (guaranteed by the invariant/contract of `Bytes`)
+ // * `old_layout` must be the same layout that was used to
allocate that block of memory (same)
+ // * `new_len` must be greater than zero
+ // * `new_len`, when rounded up to the nearest multiple of
`layout.align()`, must not overflow `isize` (guaranteed by the success of
`Layout::from_size_align`)
+ _ => NonNull::new(unsafe { std::alloc::realloc(old_ptr,
old_layout, new_len) }),
+ };
+
+ if let Some(ptr) = new_ptr {
+ self.ptr = ptr;
+ self.len = new_len;
+ self.deallocation = Deallocation::Standard(new_layout);
+ return Ok(());
+ }
+ }
+ }
+
+ Err(())
+ }
+
#[inline]
pub(crate) fn deallocation(&self) -> &Deallocation {
&self.deallocation
diff --git a/arrow/tests/shrink_to_fit.rs b/arrow/tests/shrink_to_fit.rs
new file mode 100644
index 0000000000..5d7c2cf98b
--- /dev/null
+++ b/arrow/tests/shrink_to_fit.rs
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::{
+ array::{Array, ArrayRef, ListArray, PrimitiveArray},
+ buffer::OffsetBuffer,
+ datatypes::{Field, UInt8Type},
+};
+
+/// Test that `shrink_to_fit` frees memory after concatenating a large number
of arrays.
+#[test]
+fn test_shrink_to_fit_after_concat() {
+ let array_len = 6_000;
+ let num_concats = 100;
+
+ let primitive_array: PrimitiveArray<UInt8Type> = (0..array_len)
+ .map(|v| (v % 255) as u8)
+ .collect::<Vec<_>>()
+ .into();
+ let primitive_array: ArrayRef = Arc::new(primitive_array);
+
+ let list_array: ArrayRef = Arc::new(ListArray::new(
+ Field::new_list_field(primitive_array.data_type().clone(),
false).into(),
+ OffsetBuffer::from_lengths([primitive_array.len()]),
+ primitive_array.clone(),
+ None,
+ ));
+
+ // Num bytes allocated globally and by this thread, respectively.
+ let (concatenated, _bytes_allocated_globally,
bytes_allocated_by_this_thread) =
+ memory_use(|| {
+ let mut concatenated = concatenate(num_concats,
list_array.clone());
+ concatenated.shrink_to_fit(); // This is what we're testing!
+ dbg!(concatenated.data_type());
+ concatenated
+ });
+ let expected_len = num_concats * array_len;
+ assert_eq!(bytes_used(concatenated.clone()), expected_len);
+ eprintln!("The concatenated array is {expected_len} B long. Amount of
memory used by this thread: {bytes_allocated_by_this_thread} B");
+
+ assert!(
+ expected_len <= bytes_allocated_by_this_thread,
+ "We must allocate at least as much space as the concatenated array"
+ );
+ assert!(
+ bytes_allocated_by_this_thread <= expected_len + expected_len / 100,
+ "We shouldn't have more than 1% memory overhead. In fact, we are using
{bytes_allocated_by_this_thread} B of memory for {expected_len} B of data"
+ );
+}
+
+fn concatenate(num_times: usize, array: ArrayRef) -> ArrayRef {
+ let mut concatenated = array.clone();
+ for _ in 0..num_times - 1 {
+ concatenated =
arrow::compute::kernels::concat::concat(&[&*concatenated, &*array]).unwrap();
+ }
+ concatenated
+}
+
+fn bytes_used(array: ArrayRef) -> usize {
+ let mut array = array;
+ loop {
+ match array.data_type() {
+ arrow::datatypes::DataType::UInt8 => break,
+ arrow::datatypes::DataType::List(_) => {
+ let list = array.as_any().downcast_ref::<ListArray>().unwrap();
+ array = list.values().clone();
+ }
+ _ => unreachable!(),
+ }
+ }
+
+ array.len()
+}
+
+// --- Memory tracking ---
+
+use std::{
+ alloc::Layout,
+ sync::{
+ atomic::{AtomicUsize, Ordering::Relaxed},
+ Arc,
+ },
+};
+
+static LIVE_BYTES_GLOBAL: AtomicUsize = AtomicUsize::new(0);
+
+thread_local! {
+ static LIVE_BYTES_IN_THREAD: AtomicUsize = const { AtomicUsize::new(0) } ;
+}
+
+pub struct TrackingAllocator {
+ allocator: std::alloc::System,
+}
+
+#[global_allocator]
+pub static GLOBAL_ALLOCATOR: TrackingAllocator = TrackingAllocator {
+ allocator: std::alloc::System,
+};
+
+#[allow(unsafe_code)]
+// SAFETY:
+// We just do book-keeping and then let another allocator do all the actual
work.
+unsafe impl std::alloc::GlobalAlloc for TrackingAllocator {
+ #[allow(clippy::let_and_return)]
+ unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+ // SAFETY:
+ // Just deferring
+ let ptr = unsafe { self.allocator.alloc(layout) };
+ if !ptr.is_null() {
+ LIVE_BYTES_IN_THREAD.with(|bytes| bytes.fetch_add(layout.size(),
Relaxed));
+ LIVE_BYTES_GLOBAL.fetch_add(layout.size(), Relaxed);
+ }
+ ptr
+ }
+
+ unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+ LIVE_BYTES_IN_THREAD.with(|bytes| bytes.fetch_sub(layout.size(),
Relaxed));
+ LIVE_BYTES_GLOBAL.fetch_sub(layout.size(), Relaxed);
+
+ // SAFETY:
+ // Just deferring
+ unsafe { self.allocator.dealloc(ptr, layout) };
+ }
+
+ // No need to override `alloc_zeroed` or `realloc`,
+ // since they both by default just defer to `alloc` and `dealloc`.
+}
+
+fn live_bytes_local() -> usize {
+ LIVE_BYTES_IN_THREAD.with(|bytes| bytes.load(Relaxed))
+}
+
+fn live_bytes_global() -> usize {
+ LIVE_BYTES_GLOBAL.load(Relaxed)
+}
+
+/// Returns `(result, num_bytes_allocated, num_bytes_allocated_by_this_thread)`.
+fn memory_use<R>(run: impl Fn() -> R) -> (R, usize, usize) {
+ let used_bytes_start_local = live_bytes_local();
+ let used_bytes_start_global = live_bytes_global();
+ let ret = run();
+ let bytes_used_local = live_bytes_local() - used_bytes_start_local;
+ let bytes_used_global = live_bytes_global() - used_bytes_start_global;
+ (ret, bytes_used_global, bytes_used_local)
+}