This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 9acc9fa0b8 Minor API adjustments for StringViewBuilder (#6047)
9acc9fa0b8 is described below
commit 9acc9fa0b84755c5caffb8acf806fa4fd21928bb
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jul 15 05:52:37 2024 -0400
Minor API adjustments for StringViewBuilder (#6047)
* minor update
* add memory accounting
* Update arrow-buffer/src/builder/null.rs
Co-authored-by: Andrew Lamb <[email protected]>
* Update arrow-array/src/builder/generic_bytes_view_builder.rs
Co-authored-by: Andrew Lamb <[email protected]>
* update comments
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-array/src/array/byte_view_array.rs | 6 ++++--
arrow-array/src/builder/generic_bytes_view_builder.rs | 16 +++++++++++++++-
arrow-buffer/src/builder/null.rs | 8 ++++++++
3 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/arrow-array/src/array/byte_view_array.rs
b/arrow-array/src/array/byte_view_array.rs
index a00bf7271b..5ce150a7a3 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -324,9 +324,11 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
/// Note that it will copy the array regardless of whether the original
array is compact.
/// Use with caution as this can be an expensive operation, only use it
when you are sure that the view
/// array is significantly smaller than when it is originally created,
e.g., after filtering or slicing.
+ ///
+ /// Note: this function does not attempt to canonicalize / deduplicate
values. For this
+ /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
pub fn gc(&self) -> Self {
- let mut builder =
-
GenericByteViewBuilder::<T>::with_capacity(self.len()).with_deduplicate_strings();
+ let mut builder =
GenericByteViewBuilder::<T>::with_capacity(self.len());
for v in self.iter() {
builder.append_option(v);
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs
b/arrow-array/src/builder/generic_bytes_view_builder.rs
index dda5535456..587255cc6b 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -201,7 +201,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
/// Returns the value at the given index
/// Useful if we want to know what value has been inserted to the builder
- fn get_value(&self, index: usize) -> &[u8] {
+ /// The index has to be smaller than `self.len()`, otherwise it will panic
+ pub fn get_value(&self, index: usize) -> &[u8] {
let view = self.views_builder.as_slice().get(index).unwrap();
let len = *view as u32;
if len <= 12 {
@@ -337,6 +338,19 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
pub fn validity_slice(&self) -> Option<&[u8]> {
self.null_buffer_builder.as_slice()
}
+
+ /// Return the allocated size of this builder in bytes, useful for memory
accounting.
+ pub fn allocated_size(&self) -> usize {
+ let views = self.views_builder.capacity() *
std::mem::size_of::<u128>();
+ let null = self.null_buffer_builder.allocated_size();
+ let buffer_size = self.completed.iter().map(|b|
b.capacity()).sum::<usize>();
+ let in_progress = self.in_progress.capacity();
+ let tracker = match &self.string_tracker {
+ Some((ht, _)) => ht.capacity() * std::mem::size_of::<usize>(),
+ None => 0,
+ };
+ buffer_size + in_progress + tracker + views + null
+ }
}
impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> {
diff --git a/arrow-buffer/src/builder/null.rs b/arrow-buffer/src/builder/null.rs
index 55b3303c9e..a1cea6ef2c 100644
--- a/arrow-buffer/src/builder/null.rs
+++ b/arrow-buffer/src/builder/null.rs
@@ -161,6 +161,14 @@ impl NullBufferBuilder {
pub fn as_slice_mut(&mut self) -> Option<&mut [u8]> {
self.bitmap_builder.as_mut().map(|b| b.as_slice_mut())
}
+
+ /// Return the allocated size of this builder, in bytes, useful for memory
accounting.
+ pub fn allocated_size(&self) -> usize {
+ self.bitmap_builder
+ .as_ref()
+ .map(|b| b.capacity())
+ .unwrap_or(0)
+ }
}
impl NullBufferBuilder {