This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new ec43d6fd5 Provide `into_builder` for bytearray (#3326)
ec43d6fd5 is described below
commit ec43d6fd5ebdd5f64b3556790d44bf96829e8ae8
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Sat Dec 31 10:28:21 2022 -0800
Provide `into_builder` for bytearray (#3326)
* Provide into_builder for bytearray
* For review
* Remove slices_mut
* Modify test and remove values_slice_mut
---
arrow-array/src/array/byte_array.rs | 86 ++++++++++++++++++++++++
arrow-array/src/array/string_array.rs | 25 +++++++
arrow-array/src/builder/generic_bytes_builder.rs | 40 ++++++++++-
arrow-array/src/builder/primitive_builder.rs | 4 +-
4 files changed, 152 insertions(+), 3 deletions(-)
diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs
index eb528384e..2cb04efb8 100644
--- a/arrow-array/src/array/byte_array.rs
+++ b/arrow-array/src/array/byte_array.rs
@@ -16,6 +16,7 @@
// under the License.
use crate::array::{empty_offsets, print_long_array};
+use crate::builder::GenericByteBuilder;
use crate::iterator::ArrayIter;
use crate::raw_pointer::RawPtrBox;
use crate::types::bytes::ByteArrayNativeType;
@@ -139,6 +140,91 @@ impl<T: ByteArrayType> GenericByteArray<T> {
pub fn iter(&self) -> ArrayIter<&Self> {
ArrayIter::new(self)
}
+
+ /// Converts this array into a [`GenericByteBuilder`] so that its values can be mutated,
+ /// provided the underlying null, offset and value buffers are not shared by others.
+ ///
+ /// Returns `Ok(builder)` when every buffer could be turned into a mutable buffer.
+ /// Otherwise the buffers are reassembled into an array, which is handed back as `Err`.
+ pub fn into_builder(self) -> Result<GenericByteBuilder<T>, Self> {
+ let len = self.len();
+ // Validity bitmap (if any), re-sliced to start at this array's offset.
+ let null_bit_buffer = self
+ .data
+ .null_buffer()
+ .map(|b| b.bit_slice(self.data.offset(), len));
+
+ // Offsets: `len + 1` entries of `T::Offset`, starting at this array's offset.
+ let element_len = std::mem::size_of::<T::Offset>();
+ let offset_buffer = self.data.buffers()[0]
+ .slice_with_length(self.data.offset() * element_len, (len + 1) *
element_len);
+
+ // Values: `value_len` bytes, i.e. the span between the first and last offset.
+ // NOTE(review): the slice starts at `data.offset()` bytes rather than at
+ // `value_offsets()[0]`; this looks right only when the array offset is 0 -- confirm
+ // the behaviour for sliced arrays.
+ let element_len = std::mem::size_of::<u8>();
+ let value_len =
+ T::Offset::as_usize(self.value_offsets()[len] -
self.value_offsets()[0]);
+ let value_buffer = self.data.buffers()[1]
+ .slice_with_length(self.data.offset() * element_len, value_len *
element_len);
+
+ // Drop the array's own data before attempting the conversions below; presumably this
+ // releases its references so the slices above can be uniquely owned -- TODO confirm.
+ drop(self.data);
+
+ // Step 1: the null buffer. `Ok(None)` means there was no null buffer at all.
+ let try_mutable_null_buffer = match null_bit_buffer {
+ None => Ok(None),
+ Some(null_buffer) => {
+ // Null buffer exists, tries to make it mutable
+ null_buffer.into_mutable().map(Some)
+ }
+ };
+
+ // Step 2: the offset and value buffers. On any failure, collect all three buffers
+ // (as immutable buffers) in the `Err` tuple so the array can be rebuilt.
+ let try_mutable_buffers = match try_mutable_null_buffer {
+ Ok(mutable_null_buffer) => {
+ // Got mutable null buffer, tries to get mutable value buffer
+ let try_mutable_offset_buffer = offset_buffer.into_mutable();
+ let try_mutable_value_buffer = value_buffer.into_mutable();
+
+ // try_mutable_offset_buffer.map(...).map_err(...) doesn't work as the compiler complains
+ // mutable_null_buffer is moved into map closure.
+ match (try_mutable_offset_buffer, try_mutable_value_buffer) {
+ // SAFETY (review note): all three buffers were taken from `self`, which is the
+ // precondition documented on `new_from_buffer` -- confirm for sliced arrays.
+ (Ok(mutable_offset_buffer), Ok(mutable_value_buffer)) =>
unsafe {
+ Ok(GenericByteBuilder::<T>::new_from_buffer(
+ mutable_offset_buffer,
+ mutable_value_buffer,
+ mutable_null_buffer,
+ ))
+ },
+ // In the failure arms, any buffer that did become mutable is converted back
+ // with `.into()` so the tuple holds the same buffer type in every arm.
+ (Ok(mutable_offset_buffer), Err(value_buffer)) => Err((
+ mutable_offset_buffer.into(),
+ value_buffer,
+ mutable_null_buffer.map(|b| b.into()),
+ )),
+ (Err(offset_buffer), Ok(mutable_value_buffer)) => Err((
+ offset_buffer,
+ mutable_value_buffer.into(),
+ mutable_null_buffer.map(|b| b.into()),
+ )),
+ (Err(offset_buffer), Err(value_buffer)) => Err((
+ offset_buffer,
+ value_buffer,
+ mutable_null_buffer.map(|b| b.into()),
+ )),
+ }
+ }
+ // Despite its name, this binding holds the buffer returned by the failed
+ // `into_mutable` call; the offset and value buffers were never converted.
+ Err(mutable_null_buffer) => {
+ // Unable to get mutable null buffer
+ Err((offset_buffer, value_buffer, Some(mutable_null_buffer)))
+ }
+ };
+
+ match try_mutable_buffers {
+ Ok(builder) => Ok(builder),
+ // Conversion failed: rebuild an array of the same length from the buffers and
+ // return it to the caller.
+ Err((offset_buffer, value_buffer, null_bit_buffer)) => {
+ let builder = ArrayData::builder(T::DATA_TYPE)
+ .len(len)
+ .add_buffer(offset_buffer)
+ .add_buffer(value_buffer)
+ .null_bit_buffer(null_bit_buffer);
+
+ // `build_unchecked` is used here, so the reassembled data is not re-validated.
+ let array_data = unsafe { builder.build_unchecked() };
+ let array = GenericByteArray::<T>::from(array_data);
+
+ Err(array)
+ }
+ }
+ }
}
impl<T: ByteArrayType> std::fmt::Debug for GenericByteArray<T> {
diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs
index c8db589e3..4a4152adc 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -697,4 +697,29 @@ mod tests {
assert_eq!(string.len(), 0);
assert_eq!(string.value_offsets(), &[0]);
}
+
+ #[test]
+ fn test_into_builder() {
+     let source = StringArray::from(vec!["hello", "arrow"]);
+
+     // The array is not shared here, so converting it into a builder must succeed
+     // and the builder must accept further values.
+     let mut builder = source.into_builder().unwrap();
+     builder.append_value("rust");
+
+     let actual = builder.finish();
+     let expected = StringArray::from(vec!["hello", "arrow", "rust"]);
+     assert_eq!(expected, actual);
+ }
+
+ #[test]
+ fn test_into_builder_err() {
+     let original: StringArray = vec!["hello", "arrow"].into();
+
+     // While a clone is alive, `into_builder` cannot hand back a mutable builder.
+     let keep_alive = original.clone();
+
+     // The array returned in `Err` must compare equal to the clone.
+     let returned = original.into_builder().unwrap_err();
+     assert_eq!(&returned, &keep_alive);
+ }
}
diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs
index 9f9078c70..195628f47 100644
--- a/arrow-array/src/builder/generic_bytes_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_builder.rs
@@ -19,7 +19,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder;
use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait};
-use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
use arrow_data::ArrayDataBuilder;
use std::any::Any;
use std::sync::Arc;
@@ -53,6 +53,34 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
}
}
+ /// Builds a [`GenericByteBuilder`] on top of already-populated buffers.
+ ///
+ /// The number of existing entries is taken to be the number of offsets minus one.
+ /// When `null_buffer` is `None`, the null buffer builder is created with that length
+ /// via `NullBufferBuilder::new_with_len`.
+ ///
+ /// # Safety
+ /// The buffer contents are not verified: the buffers are assumed to come from an
+ /// existing and valid [`GenericByteArray`].
+ pub unsafe fn new_from_buffer(
+     offsets_buffer: MutableBuffer,
+     value_buffer: MutableBuffer,
+     null_buffer: Option<MutableBuffer>,
+ ) -> Self {
+     let offsets_builder = BufferBuilder::<T::Offset>::new_from_buffer(offsets_buffer);
+     let value_builder = BufferBuilder::<u8>::new_from_buffer(value_buffer);
+
+     // There is one more offset than there are elements, hence the `- 1`.
+     let element_count = offsets_builder.len() - 1;
+     let null_buffer_builder = match null_buffer {
+         Some(validity) => NullBufferBuilder::new_from_buffer(validity, element_count),
+         None => NullBufferBuilder::new_with_len(element_count),
+     };
+
+     Self {
+         offsets_builder,
+         value_builder,
+         null_buffer_builder,
+     }
+ }
+
/// Appends a value into the builder.
#[inline]
pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
@@ -122,6 +150,16 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
pub fn offsets_slice(&self) -> &[T::Offset] {
self.offsets_builder.as_slice()
}
+
+ /// Returns the current null (validity) buffer as a byte slice, as exposed by the null buffer builder
+ pub fn validity_slice(&self) -> Option<&[u8]> {
+ self.null_buffer_builder.as_slice()
+ }
+
+ /// Returns the current null (validity) buffer as a mutable byte slice, as exposed by the null buffer builder
+ pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
+ self.null_buffer_builder.as_slice_mut()
+ }
}
impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs
index fa1dc3ad1..f3f3f3728 100644
--- a/arrow-array/src/builder/primitive_builder.rs
+++ b/arrow-array/src/builder/primitive_builder.rs
@@ -286,12 +286,12 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
self.values_builder.as_slice_mut()
}
- /// Returns the current values buffer as a slice
+ /// Returns the current null (validity) buffer as a byte slice, as exposed by the null buffer builder
pub fn validity_slice(&self) -> Option<&[u8]> {
self.null_buffer_builder.as_slice()
}
- /// Returns the current values buffer as a mutable slice
+ /// Returns the current null (validity) buffer as a mutable byte slice, as exposed by the null buffer builder
pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
self.null_buffer_builder.as_slice_mut()
}