alamb commented on code in PR #5796:
URL: https://github.com/apache/arrow-rs/pull/5796#discussion_r1617762117
##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -15,21 +15,42 @@
// specific language governing permissions and limitations
// under the License.
-use crate::builder::ArrayBuilder;
-use crate::types::{BinaryViewType, ByteViewType, StringViewType};
-use crate::{ArrayRef, GenericByteViewArray};
-use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer};
-use arrow_data::ByteView;
-
use std::any::Any;
use std::marker::PhantomData;
use std::sync::Arc;
+use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer};
+use arrow_data::ByteView;
+use arrow_schema::ArrowError;
+
+use crate::builder::ArrayBuilder;
+use crate::types::bytes::ByteArrayNativeType;
+use crate::types::{BinaryViewType, ByteViewType, StringViewType};
+use crate::{ArrayRef, GenericByteViewArray};
+
const DEFAULT_BLOCK_SIZE: u32 = 8 * 1024;
/// A builder for [`GenericByteViewArray`]
///
-/// See [`Self::append_value`] for the allocation strategy
+/// A [`GenericByteViewArray`] consists of a list of data blocks containing
string data,
+/// and a list of views into those buffers.
+///
+/// This builder can be used in two ways
+///
+/// # Append Values
+///
+/// To avoid bump allocating this builder allocates data in fixed size blocks,
configurable
Review Comment:
```suggestion
/// To avoid bump allocating, this builder allocates data in fixed size
blocks, configurable
```
##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -62,6 +83,98 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
Self { block_size, ..self }
}
+ /// Append a new data block returning the new block offset
+ ///
+ /// Note: this will first flush any in-progress block
+ ///
+ /// This allows appending views from blocks added using
[`Self::append_block`]. See
+ /// [`Self::append_value`] for appending individual values
+ ///
+ /// ```
+ /// # use arrow_array::builder::StringViewBuilder;
+ /// let mut builder = StringViewBuilder::new();
+ ///
+ /// let block = builder.append_block(b"helloworldbingobongo".into());
+ ///
+ /// builder.try_append_view(block, 0, 5).unwrap();
+ /// builder.try_append_view(block, 5, 5).unwrap();
+ /// builder.try_append_view(block, 10, 5).unwrap();
+ /// builder.try_append_view(block, 15, 5).unwrap();
+ /// builder.try_append_view(block, 0, 15).unwrap();
+ /// let array = builder.finish();
+ ///
+ /// let actual: Vec<_> = array.iter().flatten().collect();
+ /// let expected = &["hello", "world", "bingo", "bongo",
"helloworldbingo"];
+ /// assert_eq!(actual, expected);
+ /// ```
+ pub fn append_block(&mut self, buffer: Buffer) -> u32 {
+ assert!(buffer.len() < u32::MAX as usize);
+
+ self.flush_in_progress();
+ let offset = self.completed.len();
+ self.push_completed(buffer);
+ offset as u32
+ }
+
+ /// Try to append a view of the given `block`, `offset` and `length`
+ ///
+ /// See [`Self::append_block`]
+ pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) ->
Result<(), ArrowError> {
+ let b = self.completed.get(block as usize).ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!("No block found with
index {block}"))
+ })?;
+ let start = offset as usize;
+ let end = start.saturating_add(len as usize);
+
+ let b = b.get(start..end).ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "Range {start}..{end} out of bounds for block of length {}",
+ b.len()
+ ))
+ })?;
+
+ if T::Native::from_bytes_checked(b).is_none() {
+ return Err(ArrowError::InvalidArgumentError(
+ "Invalid view data".to_string(),
+ ));
+ }
+
+ if len <= 12 {
Review Comment:
Big fan of encapsulating this logic into the view builder.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]