alamb commented on code in PR #5619:
URL: https://github.com/apache/arrow-rs/pull/5619#discussion_r1591006739
##########
arrow-data/src/transform/mod.rs:
##########
@@ -178,13 +178,17 @@ fn build_extend_view(array: &ArrayData, buffer_offset:
u32) -> Extend {
mutable
.buffer1
.extend(views[start..start + len].iter().map(|v| {
Review Comment:
Here is another example of how the magic 12 number gets extracted out
##########
arrow-data/src/byte_view.rs:
##########
@@ -15,10 +15,453 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_buffer::Buffer;
+use arrow_buffer::{Buffer, ToByteSlice};
use arrow_schema::ArrowError;
+use std::fmt::Formatter;
+use std::ops::Range;
-#[derive(Debug, Copy, Clone, Default)]
+/// A `View` is a `u128` value that represents a single value in a
+/// [`GenericByteViewArray`].
+///
+/// Depending on the array type, the value may be a utf8 string or simply
bytes.
+/// The layout of the u128 is different depending on the length of the bytes
+/// stored at that location:
+///
+/// # 12 or fewer bytes [`InlineView`]
+///
+/// Values with 12 or fewer bytes are stored directly inlined in the `u128`.
See
+/// [`InlineView`] for field access.
+///
+/// ```text
+///
┌───────────────────────────────────────────┬──────────────┐
+/// │ data │
length │
+/// Strings, len <= 12 │ (padded with \0) │
(u32) │
+/// (InlineView) │ │
│
+///
└───────────────────────────────────────────┴──────────────┘
+/// 127 31
0 bit
+///
offset
+/// ```
+///
+/// # More than 12 bytes [`OffsetView`]
+///
+/// Values with more than 12 bytes store the first 4 bytes inline, an offset
and
+/// buffer index that reference the actual data (including the first 4 bytes)
in
+/// an externally managed buffer. See [`OffsetView`] for field access.
+///
+/// ```text
+///
┌──────────────┬─────────────┬──────────────┬──────────────┐
+/// │buffer offset │ buffer index│ data prefix │
length │
+/// Strings, len > 12 │ (u32) │ (u32) │ (4 bytes) │
(u32) │
+/// (OffsetView) │ │ │ │
│
+///
└──────────────┴─────────────┴──────────────┴──────────────┘
+/// 127 95 63 31
0 bit
+///
offset
+/// ```
+///
+/// See Also:
+/// * [`OwnedView`]: An owned variant of [`View`], used for constructing views
+///
+/// [`GenericByteViewArray`]:
https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
+///
+/// # Notes
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub enum View<'a> {
+ /// Entire string is inlined
+ Inline(InlineView<'a>),
+ /// String is stored in buffer, 4 byte prefix stored inline
+ Offset(OffsetView<'a>),
+}
+impl<'a> From<&'a u128> for View<'a> {
+ #[inline(always)]
+ fn from(v: &'a u128) -> Self {
+ let len = *v as u32;
+ if len <= 12 {
+ Self::Inline(InlineView::from(v))
+ } else {
+ Self::Offset(OffsetView::from(v))
+ }
+ }
+}
+
+/// Owned variant of [`View`] for constructing views from a string or byte
slice.
+///
+/// # Example
+/// ```
+/// # use arrow_data::OwnedView;
+/// // contruct a view from a string
+/// let view = OwnedView::new_from_str("hello");
+/// assert!(matches!(view, OwnedView::Inline(_)));
+/// ```
+///
+/// ```
+/// # use arrow_data::OwnedView;
+/// // contruct a view from a longer string
+/// let view = OwnedView::new_from_str("hello my name is crumple faced fish");
+/// assert!(matches!(view, OwnedView::Offset(_)));
+/// ```
+///
+/// # Notes
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(PartialEq)]
+pub enum OwnedView {
+ /// [`InlineView`]: Data is inlined (12 or fewer bytes)
+ Inline(u128),
+ /// [`OffsetView`]: Data is stored in a buffer (more than 12 bytes)
+ Offset(u128),
+}
+
+impl OwnedView {
+ /// Create a new `OwnedView` from a preexisting u128 that represents a
view.
+ ///
+ /// Note no validation is done on the u128 (e.g. no length checking)
+ pub fn new(v: u128) -> Self {
+ let len = v as u32;
+ if len <= 12 {
+ Self::Inline(v)
+ } else {
+ Self::Offset(v)
+ }
+ }
+
+ /// Create a new view from a string
+ ///
+ /// See [`OwnedView::new_from_bytes`] for more details
+ pub fn new_from_str(value: &str) -> Self {
+ Self::new_from_bytes(value.as_bytes())
+ }
+
+ /// Construct an `OwnedView` from a byte slice
+ ///
+ /// This function constructs the appropriate view type to represent this
+ /// value, inlining the value or prefix as appropriate.
+ ///
+ /// # Notes:
+ /// * Does not manage any buffers / offsets
+ /// * A created [`OwnedView::Offset`] has buffer index and offset set to
zero
+ #[inline(always)]
+ pub fn new_from_bytes(v: &[u8]) -> Self {
+ let length: u32 = v.len().try_into().unwrap();
+ let mut view_buffer = [0; 16];
+ view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
+
+ if length <= 12 {
+ // copy all values
+ view_buffer[4..4 + v.len()].copy_from_slice(v);
+ Self::Inline(u128::from_le_bytes(view_buffer))
+ } else {
+ // copy 4 byte prefix
+ view_buffer[4..8].copy_from_slice(&v[0..4]);
+ Self::Offset(u128::from_le_bytes(view_buffer))
+ }
+ }
+
+ // Convert this `OwnedView` to a `View`
+ pub fn as_view(&self) -> View {
+ match self {
+ Self::Inline(inline) => View::Inline(InlineView::from(inline)),
+ Self::Offset(offset) => View::Offset(OffsetView::from(offset)),
+ }
+ }
+}
+
+impl std::fmt::Debug for OwnedView {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ // format with hex bytes
+ match self {
+ Self::Inline(inline) => write!(f,
"OwnedView::Inline({inline:#20x})"),
+ Self::Offset(offset) => write!(f,
"OwnedView::Offset({offset:#20x})"),
+ }
+ }
+}
+
+impl From<&str> for OwnedView {
+ fn from(value: &str) -> Self {
+ Self::new_from_str(value)
+ }
+}
+
+impl From<&[u8]> for OwnedView {
+ fn from(value: &[u8]) -> Self {
+ Self::new_from_bytes(value)
+ }
+}
+
+impl From<u128> for OwnedView {
+ fn from(value: u128) -> Self {
+ Self::new(value)
+ }
+}
+
+/// A view for data where the variable length data is less than or equal to 12.
+/// See documentation on [`View`] for details.
+///
+/// # Notes
+/// Note there is no validation done when converting to/from u128
+///
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(Copy, Clone, PartialEq)]
+pub struct InlineView<'a>(&'a u128);
+
+impl<'a> std::fmt::Debug for InlineView<'a> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ // format with hex bytes
+ write!(f, "InlineView({:#020x})", self.0)
+ }
+}
+
+impl<'a> InlineView<'a> {
+ /// Create a new inline view from a u128
+ #[inline(always)]
+ pub fn new_from_u128(v: &'a u128) -> Self {
+ Self(v)
+ }
+
+ /// Return a reference to the u128
+ pub fn as_u128(self) -> &'a u128 {
+ self.0
+ }
+
+ /// Convert this inline view to a u128
+ pub fn into_u128(self) -> u128 {
+ *self.0
+ }
+
+ /// Return the length of the data, in bytes
+ #[inline(always)]
+ pub fn len(&self) -> usize {
+ // take first 4 bytes
+ let len = *self.0 as u32;
+ len as usize
+ }
+
+ /// Return true of the length of the data is zero
+ #[inline(always)]
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Access the value of the data, as bytes
+ ///
+ /// # Panics
+ /// If the length is greater than 12 (aka if this view is invalid)
+ #[inline(always)]
+ pub fn as_bytes(&self) -> &[u8] {
+ &self.0.to_byte_slice()[4..4 + self.len()]
+ }
+
+ /// Access the value of the data, as bytes, unchecked
+ ///
+ /// # Safety
+ /// Undefined behavior if the length is greater than 12
+ #[inline(always)]
+ pub unsafe fn as_bytes_unchecked(&self) -> &[u8] {
+ self.get_bytes_unchecked(self.0)
+ }
+
+ /// Access the value of `v`, as bytes, unchecked described by this view
+ ///
+ /// This method can be used to access the inlined bytes described by this
+ /// view directly from a reference to the underlying `u128`.
+ ///
+ /// # Safety
+ /// Undefined behavior if the length is greater than 12
+ #[inline(always)]
+ pub unsafe fn get_bytes_unchecked<'b>(&self, v: &'b u128) -> &'b [u8] {
+ v.to_byte_slice().get_unchecked(4..4 + self.len())
+ }
+}
+
+impl<'a> From<&'a u128> for InlineView<'a> {
+ #[inline(always)]
+ fn from(v: &'a u128) -> Self {
+ Self::new_from_u128(v)
+ }
+}
+
+impl<'a> From<InlineView<'a>> for &'a u128 {
+ #[inline(always)]
+ fn from(view: InlineView<'a>) -> Self {
+ view.as_u128()
+ }
+}
+
+impl<'a> From<InlineView<'a>> for u128 {
+ #[inline(always)]
+ fn from(view: InlineView) -> Self {
+ view.into_u128()
+ }
+}
+
+/// A view for data where the length variable length data is greater than
+/// 12 bytes.
+///
+/// See documentation on [`View`] for details.
+///
+/// # Notes
+/// There is no validation done when converting to/from u128
+///
+/// # See Also
+/// * [`View`] to determine the correct view type for a given `u128`
+/// * [`OffsetViewBuilder`] for modifying the buffer index and offset of an
`OffsetView`
+#[derive(Copy, Clone, PartialEq)]
+pub struct OffsetView<'a>(&'a u128);
+
+impl<'a> std::fmt::Debug for OffsetView<'a> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ // format with hex bytes
+ write!(f, "OffsetView({:#020x})", self.0)
+ }
+}
+
+impl<'a> OffsetView<'a> {
+ /// Create a new inline view from a u128
+ #[inline(always)]
+ pub fn new_from_u128(v: &'a u128) -> Self {
+ Self(v)
+ }
+
+ /// Return a reference to the inner u128
+ pub fn as_u128(self) -> &'a u128 {
+ self.0
+ }
+
+ /// Convert this inline view to a u128
+ pub fn to_u128(&self) -> u128 {
+ *self.0
+ }
+
+ /// Return the length of the data, in bytes
+ #[inline(always)]
+ pub fn len(&self) -> usize {
+ // take first 4 bytes
+ let len = *self.0 as u32;
+ len as usize
+ }
+
+ /// Return true if the view represents an empty string
+ ///
+ /// # Notes
+ ///
+ /// Since an `OffsetView` is always greater than 12 bytes, this function
+ /// always returns false.
+ #[inline(always)]
+ pub fn is_empty(&self) -> bool {
+ false
+ }
+
+ /// Return the prefix of the data (always 4 bytes)
+ #[inline(always)]
+ pub fn prefix_as_bytes(&self) -> &[u8] {
+ &self.0.to_byte_slice()[4..8]
+ }
+
+ /// Return the buffer index
+ #[inline(always)]
+ pub fn buffer_index(&self) -> u32 {
+ (((*self.0) & 0x00000000_ffffffff_00000000_00000000) >> 64) as u32
+ }
+
+ /// Return the offset into the buffer
+ #[inline(always)]
+ pub fn offset(&self) -> u32 {
+ (((*self.0) & 0xffffffff_00000000_00000000_00000000) >> 96) as u32
+ }
+
+ /// Return the range of the data in the offset buffer
+ #[inline(always)]
+ pub fn range(&self) -> Range<usize> {
+ let offset = self.offset() as usize;
+ offset..(offset + self.len())
+ }
+
+ /// Return a builder for modifying this view
+ pub fn into_builder(&self) -> OffsetViewBuilder {
+ OffsetViewBuilder::new_from_u128(*self.0)
+ }
+}
+
+impl<'a> From<&'a u128> for OffsetView<'a> {
+ #[inline(always)]
+ fn from(v: &'a u128) -> Self {
+ Self::new_from_u128(v)
+ }
+}
+
+impl<'a> From<OffsetView<'a>> for &'a u128 {
+ #[inline(always)]
+ fn from(view: OffsetView<'a>) -> Self {
+ view.as_u128()
+ }
+}
+
+impl<'a> From<OffsetView<'a>> for u128 {
+ #[inline(always)]
+ fn from(view: OffsetView) -> Self {
+ view.to_u128()
+ }
+}
+
+/// Builder for [`OffsetView`]s
+///
+/// This builder can help set offset and buffer index of an `OffsetView`.
+///
+/// Note that the builder does not permit changing the length or prefix of the
+/// view. To change the length or prefix, create a new `OffsetView` using
+/// [`OwnedView::new_from_bytes`].
+#[derive(Clone, PartialEq)]
+pub struct OffsetViewBuilder(u128);
+
+impl std::fmt::Debug for OffsetViewBuilder {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ write!(f, "OffsetViewBuilder({:#020x})", self.0)
+ }
+}
+
+impl OffsetViewBuilder {
+ fn new_from_u128(v: u128) -> Self {
+ Self(v)
+ }
+
+ /// Retrieve the u128 as a OffsetView
+ pub fn as_offset_view(&self) -> OffsetView<'_> {
+ OffsetView::new_from_u128(&self.0)
+ }
+
+ /// Set the buffer index
+ pub fn with_buffer_index(self, buffer_index: u32) -> Self {
+ Self(self.0 | ((buffer_index as u128) << 64))
+ }
+
+ /// Set the offset
+ pub fn with_offset(self, offset: u32) -> Self {
+ Self(self.0 | ((offset as u128) << 96))
+ }
+
+ /// Return the inner u128, consuming the builder
+ pub fn build(self) -> u128 {
+ self.0
+ }
+}
+
+impl From<u128> for OffsetViewBuilder {
+ fn from(v: u128) -> Self {
+ Self::new_from_u128(v)
+ }
+}
+
+impl From<OffsetViewBuilder> for u128 {
+ fn from(builder: OffsetViewBuilder) -> Self {
+ builder.build()
+ }
+}
+
+/// A view for data where the variable length data has 12 or fewer bytes. See
Review Comment:
If this PR is merged, I will make a follow on PR to remove the remaining
uses of ByteView and deprecate this struct
##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -72,35 +72,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
#[inline]
pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
let v: &[u8] = value.as_ref().as_ref();
- let length: u32 = v.len().try_into().unwrap();
- if length <= 12 {
- let mut view_buffer = [0; 16];
- view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
- view_buffer[4..4 + v.len()].copy_from_slice(v);
- self.views_builder.append(u128::from_le_bytes(view_buffer));
- self.null_buffer_builder.append_non_null();
- return;
- }
- let required_cap = self.in_progress.len() + v.len();
- if self.in_progress.capacity() < required_cap {
- let in_progress = Vec::with_capacity(v.len().max(self.block_size
as usize));
- let flushed = std::mem::replace(&mut self.in_progress,
in_progress);
- if !flushed.is_empty() {
- assert!(self.completed.len() < u32::MAX as usize);
- self.completed.push(flushed.into());
+ let view: u128 = match OwnedView::from(v) {
Review Comment:
I think the construction is now much clearer and encapsulated -- the bit
manipulation is handled by `OwnedView` and `OffsetViewBuilder`
##########
arrow-data/src/byte_view.rs:
##########
@@ -15,10 +15,453 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_buffer::Buffer;
+use arrow_buffer::{Buffer, ToByteSlice};
use arrow_schema::ArrowError;
+use std::fmt::Formatter;
+use std::ops::Range;
-#[derive(Debug, Copy, Clone, Default)]
+/// A `View` is a `u128` value that represents a single value in a
+/// [`GenericByteViewArray`].
+///
+/// Depending on the array type, the value may be a utf8 string or simply
bytes.
+/// The layout of the u128 is different depending on the length of the bytes
+/// stored at that location:
+///
+/// # 12 or fewer bytes [`InlineView`]
+///
+/// Values with 12 or fewer bytes are stored directly inlined in the `u128`.
See
+/// [`InlineView`] for field access.
+///
+/// ```text
+///
┌───────────────────────────────────────────┬──────────────┐
+/// │ data │
length │
+/// Strings, len <= 12 │ (padded with \0) │
(u32) │
+/// (InlineView) │ │
│
+///
└───────────────────────────────────────────┴──────────────┘
+/// 127 31
0 bit
+///
offset
+/// ```
+///
+/// # More than 12 bytes [`OffsetView`]
+///
+/// Values with more than 12 bytes store the first 4 bytes inline, an offset
and
+/// buffer index that reference the actual data (including the first 4 bytes)
in
+/// an externally managed buffer. See [`OffsetView`] for field access.
+///
+/// ```text
+///
┌──────────────┬─────────────┬──────────────┬──────────────┐
+/// │buffer offset │ buffer index│ data prefix │
length │
+/// Strings, len > 12 │ (u32) │ (u32) │ (4 bytes) │
(u32) │
+/// (OffsetView) │ │ │ │
│
+///
└──────────────┴─────────────┴──────────────┴──────────────┘
+/// 127 95 63 31
0 bit
+///
offset
+/// ```
+///
+/// See Also:
+/// * [`OwnedView`]: An owned variant of [`View`], used for constructing views
+///
+/// [`GenericByteViewArray`]:
https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
+///
+/// # Notes
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub enum View<'a> {
+ /// Entire string is inlined
+ Inline(InlineView<'a>),
+ /// String is stored in buffer, 4 byte prefix stored inline
+ Offset(OffsetView<'a>),
+}
+impl<'a> From<&'a u128> for View<'a> {
+ #[inline(always)]
+ fn from(v: &'a u128) -> Self {
Review Comment:
this is the key API for a `View` -- dispatching to the correct variant
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -22,52 +22,36 @@ use crate::types::bytes::ByteArrayNativeType;
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
use crate::{Array, ArrayAccessor, ArrayRef};
use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
-use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
+use arrow_data::{ArrayData, ArrayDataBuilder, OffsetView, View};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::Arc;
-/// [Variable-size Binary View Layout]: An array of variable length bytes view
arrays.
-///
-/// Different than [`crate::GenericByteArray`] as it stores both an offset and
length
-/// meaning that take / filter operations can be implemented without copying
the underlying data.
-///
-/// See [`StringViewArray`] for storing utf8 encoded string data and
-/// [`BinaryViewArray`] for storing bytes.
-///
-/// [Variable-size Binary View Layout]:
https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
+/// [Variable-size Binary View Layout]: An array of variable length byte
strings.
///
/// A `GenericByteViewArray` stores variable length byte strings. An array of
-/// `N` elements is stored as `N` fixed length "views" and a variable number
+/// `N` elements is stored as `N` fixed length [`View`]s and some number
/// of variable length "buffers".
///
-/// Each view is a `u128` value layout is different depending on the
-/// length of the string stored at that location:
+/// There are no constraints on offsets other than they must point into a valid
+/// buffer. The offsets can be out of order, non-continuous and overlapping.
///
-/// ```text
Review Comment:
moved to `View`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]