This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new c096172b76 Track the memory usage of custom allocations so that their 
size can be reported via Array::get_buffer_memory_size (#5347)
c096172b76 is described below

commit c096172b769da8003ea7816086df60f38229a891
Author: Jörn Horstmann <[email protected]>
AuthorDate: Wed Jan 31 11:35:57 2024 +0100

    Track the memory usage of custom allocations so that their size can be 
reported via Array::get_buffer_memory_size (#5347)
---
 arrow-array/src/array/mod.rs         | 13 +++++++++++++
 arrow-buffer/src/alloc/mod.rs        | 21 ++++++++++++++++++---
 arrow-buffer/src/buffer/immutable.rs |  2 +-
 arrow-buffer/src/buffer/scalar.rs    |  2 +-
 arrow-buffer/src/bytes.rs            | 13 +++++++------
 5 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index f19406c161..1a58598543 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -272,6 +272,8 @@ pub trait Array: std::fmt::Debug + Send + Sync {
 
     /// Returns the total number of bytes of memory pointed to by this array.
     /// The buffers store bytes in the Arrow memory format, and include the 
data as well as the validity map.
+    /// Note that this does not always correspond to the exact memory usage of 
an array,
+    /// since multiple arrays can share the same buffers or slices thereof.
     fn get_buffer_memory_size(&self) -> usize;
 
     /// Returns the total number of bytes of memory occupied physically by 
this array.
@@ -934,6 +936,17 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_memory_size_primitive_sliced() {
+        let arr = PrimitiveArray::<Int64Type>::from_iter_values(0..128);
+        let slice1 = arr.slice(0, 64);
+        let slice2 = arr.slice(64, 64);
+
+        // both slices report the full buffer memory usage, even though the 
buffers are shared
+        assert_eq!(slice1.get_array_memory_size(), 
arr.get_array_memory_size());
+        assert_eq!(slice2.get_array_memory_size(), 
arr.get_array_memory_size());
+    }
+
     #[test]
     fn test_memory_size_primitive_nullable() {
         let arr: PrimitiveArray<Int64Type> = (0..128)
diff --git a/arrow-buffer/src/alloc/mod.rs b/arrow-buffer/src/alloc/mod.rs
index a3cb6253f3..d7108d2969 100644
--- a/arrow-buffer/src/alloc/mod.rs
+++ b/arrow-buffer/src/alloc/mod.rs
@@ -38,7 +38,9 @@ pub(crate) enum Deallocation {
     Standard(Layout),
     /// An allocation from an external source like the FFI interface
     /// Deallocation will happen on `Allocation::drop`
-    Custom(Arc<dyn Allocation>),
+    /// The size of the allocation is tracked here separately only
+    /// for memory usage reporting via `Array::get_buffer_memory_size`
+    Custom(Arc<dyn Allocation>, usize),
 }
 
 impl Debug for Deallocation {
@@ -47,9 +49,22 @@ impl Debug for Deallocation {
             Deallocation::Standard(layout) => {
                 write!(f, "Deallocation::Standard {layout:?}")
             }
-            Deallocation::Custom(_) => {
-                write!(f, "Deallocation::Custom {{ capacity: unknown }}")
+            Deallocation::Custom(_, size) => {
+                write!(f, "Deallocation::Custom {{ capacity: {size} }}")
             }
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::alloc::Deallocation;
+
+    #[test]
+    fn test_size_of_deallocation() {
+        assert_eq!(
+            std::mem::size_of::<Deallocation>(),
+            3 * std::mem::size_of::<usize>()
+        );
+    }
+}
diff --git a/arrow-buffer/src/buffer/immutable.rs 
b/arrow-buffer/src/buffer/immutable.rs
index 9db8732f36..f7fc7cffdc 100644
--- a/arrow-buffer/src/buffer/immutable.rs
+++ b/arrow-buffer/src/buffer/immutable.rs
@@ -124,7 +124,7 @@ impl Buffer {
         len: usize,
         owner: Arc<dyn Allocation>,
     ) -> Self {
-        Buffer::build_with_arguments(ptr, len, Deallocation::Custom(owner))
+        Buffer::build_with_arguments(ptr, len, Deallocation::Custom(owner, 
len))
     }
 
     /// Auxiliary method to create a new Buffer
diff --git a/arrow-buffer/src/buffer/scalar.rs 
b/arrow-buffer/src/buffer/scalar.rs
index f1c2ae7857..3826d74e43 100644
--- a/arrow-buffer/src/buffer/scalar.rs
+++ b/arrow-buffer/src/buffer/scalar.rs
@@ -134,7 +134,7 @@ impl<T: ArrowNativeType> From<Buffer> for ScalarBuffer<T> {
                 is_aligned,
                 "Memory pointer is not aligned with the specified scalar type"
             ),
-            Deallocation::Custom(_) =>
+            Deallocation::Custom(_, _) =>
                 assert!(is_aligned, "Memory pointer from external source (e.g, 
FFI) is not aligned with the specified scalar type. Before importing buffer 
through FFI, please make sure the allocation is aligned."),
         }
 
diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs
index 81860b6048..ba61342d8e 100644
--- a/arrow-buffer/src/bytes.rs
+++ b/arrow-buffer/src/bytes.rs
@@ -90,9 +90,9 @@ impl Bytes {
     pub fn capacity(&self) -> usize {
         match self.deallocation {
             Deallocation::Standard(layout) => layout.size(),
-            // we cannot determine this in general,
-            // and thus we state that this is externally-owned memory
-            Deallocation::Custom(_) => 0,
+            // we only know the size of the custom allocation
+            // its underlying capacity might be larger
+            Deallocation::Custom(_, size) => size,
         }
     }
 
@@ -116,7 +116,7 @@ impl Drop for Bytes {
                 _ => unsafe { std::alloc::dealloc(self.ptr.as_ptr(), *layout) 
},
             },
             // The automatic drop implementation will free the memory once the 
reference count reaches zero
-            Deallocation::Custom(_allocation) => (),
+            Deallocation::Custom(_allocation, _size) => (),
         }
     }
 }
@@ -147,10 +147,11 @@ impl Debug for Bytes {
 
 impl From<bytes::Bytes> for Bytes {
     fn from(value: bytes::Bytes) -> Self {
+        let len = value.len();
         Self {
-            len: value.len(),
+            len,
             ptr: NonNull::new(value.as_ptr() as _).unwrap(),
-            deallocation: Deallocation::Custom(std::sync::Arc::new(value)),
+            deallocation: Deallocation::Custom(std::sync::Arc::new(value), 
len),
         }
     }
 }

Reply via email to