This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 1e9e5a2cb Add better documentation, examples and builder-style API to
`ByteView` (#6479)
1e9e5a2cb is described below
commit 1e9e5a2cbf9b5ba43730cb634ff4b913b0e16324
Author: Andrew Lamb <[email protected]>
AuthorDate: Tue Oct 1 16:02:36 2024 -0400
Add better documentation, examples and builder-style API to `ByteView`
(#6479)
* Add better documentation, examples and builder-style API to `ByteView`
* simplify tests
* Apply suggestions from code review
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
* Moar inline
* Update arrow-data/src/byte_view.rs
Co-authored-by: Xiangpeng Hao <[email protected]>
* fmt
* Update test so it fails at the right place
---------
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
Co-authored-by: Xiangpeng Hao <[email protected]>
---
arrow-array/src/array/byte_view_array.rs | 29 +++++++------
arrow-data/src/byte_view.rs | 72 +++++++++++++++++++++++++++++++-
2 files changed, 84 insertions(+), 17 deletions(-)
diff --git a/arrow-array/src/array/byte_view_array.rs
b/arrow-array/src/array/byte_view_array.rs
index c53478d8b..b1b558057 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -44,8 +44,11 @@ use super::ByteArrayType;
///
/// # See Also
///
-/// See [`StringViewArray`] for storing utf8 encoded string data and
-/// [`BinaryViewArray`] for storing bytes.
+/// * [`StringViewArray`] for storing utf8 encoded string data
+/// * [`BinaryViewArray`] for storing bytes
+/// * [`ByteView`] to interpret the `u128` layout of the views.
+///
+/// [`ByteView`]: arrow_data::ByteView
///
/// # Notes
///
@@ -872,12 +875,9 @@ mod tests {
#[should_panic(expected = "Invalid buffer index at 0: got index 3 but only
has 1 buffers")]
fn new_with_invalid_view_data() {
let v = "large payload over 12 bytes";
- let view = ByteView {
- length: 13,
- prefix: u32::from_le_bytes(v.as_bytes()[0..4].try_into().unwrap()),
- buffer_index: 3,
- offset: 1,
- };
+ let view = ByteView::new(13, &v.as_bytes()[0..4])
+ .with_buffer_index(3)
+ .with_offset(1);
let views = ScalarBuffer::from(vec![view.into()]);
let buffers = vec![Buffer::from_slice_ref(v)];
StringViewArray::new(views, buffers, None);
@@ -888,13 +888,12 @@ mod tests {
expected = "Encountered non-UTF-8 data at index 0: invalid utf-8
sequence of 1 bytes from index 0"
)]
fn new_with_invalid_utf8_data() {
- let v: Vec<u8> = vec![0xf0, 0x80, 0x80, 0x80];
- let view = ByteView {
- length: v.len() as u32,
- prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
- buffer_index: 0,
- offset: 0,
- };
+ let v: Vec<u8> = vec![
+ // invalid UTF8
+ 0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00,
+ ];
+ let view = ByteView::new(v.len() as u32, &v[0..4]);
let views = ScalarBuffer::from(vec![view.into()]);
let buffers = vec![Buffer::from_slice_ref(v)];
StringViewArray::new(views, buffers, None);
diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs
index 6f6d6d175..3b3ec6246 100644
--- a/arrow-data/src/byte_view.rs
+++ b/arrow-data/src/byte_view.rs
@@ -21,8 +21,40 @@ use arrow_schema::ArrowError;
/// Helper to access views of [`GenericByteViewArray`] (`StringViewArray` and
/// `BinaryViewArray`) where the length is greater than 12 bytes.
///
-/// See the documentation on [`GenericByteViewArray`] for more information on
-/// the layout of the views.
+/// See Also:
+/// * [`GenericByteViewArray`] for more information on the layout of the views.
+/// * [`validate_binary_view`] and [`validate_string_view`] to validate views
+///
+/// # Example: Create a new u128 view
+///
+/// ```rust
+/// # use arrow_data::ByteView;
+/// // Create a view for a string of length 20
+/// // first four bytes are "Rust"
+/// // stored in buffer 3
+/// // at offset 42
+/// let prefix = "Rust";
+/// let view = ByteView::new(20, prefix.as_bytes())
+/// .with_buffer_index(3)
+/// .with_offset(42);
+///
+/// // create the final u128
+/// let v = view.as_u128();
+/// assert_eq!(v, 0x2a000000037473755200000014);
+/// ```
+///
+/// # Example: decode a `u128` into its constituent fields
+/// ```rust
+/// # use arrow_data::ByteView;
+/// // Convert a u128 to a ByteView
+/// // See validate_{string,binary}_view functions to validate
+/// let v = ByteView::from(0x2a000000037473755200000014);
+///
+/// assert_eq!(v.length, 20);
+/// assert_eq!(v.prefix, 0x74737552);
+/// assert_eq!(v.buffer_index, 3);
+/// assert_eq!(v.offset, 42);
+/// ```
///
/// [`GenericByteViewArray`]:
https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
#[derive(Debug, Copy, Clone, Default)]
@@ -39,6 +71,42 @@ pub struct ByteView {
}
impl ByteView {
+ /// Construct a [`ByteView`] for data of `length` bytes with the specified
prefix.
+ ///
+ /// See example on [`ByteView`] docs
+ ///
+ /// Notes:
+ /// * the length should always be greater than 12 (Data of 12 bytes or
+ /// fewer is stored as an inline view)
+ /// * `buffer_index` and `offset` are set to `0`
+ ///
+ /// # Panics
+ /// If the prefix is not exactly 4 bytes
+ #[inline]
+ pub fn new(length: u32, prefix: &[u8]) -> Self {
+ debug_assert!(length > 12);
+ Self {
+ length,
+ prefix: u32::from_le_bytes(prefix.try_into().unwrap()),
+ buffer_index: 0,
+ offset: 0,
+ }
+ }
+
+ /// Set the [`Self::buffer_index`] field
+ #[inline]
+ pub fn with_buffer_index(mut self, buffer_index: u32) -> Self {
+ self.buffer_index = buffer_index;
+ self
+ }
+
+ /// Set the [`Self::offset`] field
+ #[inline]
+ pub fn with_offset(mut self, offset: u32) -> Self {
+ self.offset = offset;
+ self
+ }
+
#[inline(always)]
/// Convert `ByteView` to `u128` by concatenating the fields
pub fn as_u128(self) -> u128 {