tustvold commented on code in PR #5619:
URL: https://github.com/apache/arrow-rs/pull/5619#discussion_r1593438379
##########
arrow-data/src/byte_view.rs:
##########
@@ -15,10 +15,477 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_buffer::Buffer;
+use arrow_buffer::{Buffer, ToByteSlice};
use arrow_schema::ArrowError;
+use std::fmt::Formatter;
+use std::ops::Range;
-#[derive(Debug, Copy, Clone, Default)]
+/// A `View` is a `u128` value that represents a single value in a
+/// [`GenericByteViewArray`].
+///
+/// Depending on the array type, the value may be a utf8 string or simply
bytes.
+/// The layout of the u128 is different depending on the length of the bytes
+/// stored at that location:
+///
+/// # 12 or fewer bytes [`InlineView`]
+///
+/// Values with 12 or fewer bytes are stored directly inlined in the `u128`.
See
+/// [`InlineView`] for field access.
+///
+/// ```text
+///
┌───────────────────────────────────────────┬──────────────┐
+/// │ data │
length │
+/// Strings, len <= 12 │ (padded with \0) │
(u32) │
+/// (InlineView) │ │
│
+///
└───────────────────────────────────────────┴──────────────┘
+/// 127 31
0 bit
+///
offset
+/// ```
+///
+/// # More than 12 bytes [`OffsetView`]
+///
+/// Values with more than 12 bytes store the first 4 bytes inline, an offset
and
+/// buffer index that reference the actual data (including the first 4 bytes)
in
+/// an externally managed buffer. See [`OffsetView`] for field access.
+///
+/// ```text
+///
┌──────────────┬─────────────┬──────────────┬──────────────┐
+/// │buffer offset │ buffer index│ data prefix │
length │
+/// Strings, len > 12 │ (u32) │ (u32) │ (4 bytes) │
(u32) │
+/// (OffsetView) │ │ │ │
│
+///
└──────────────┴─────────────┴──────────────┴──────────────┘
+/// 127 95 63 31
0 bit
+///
offset
+/// ```
+///
+/// See Also:
+/// * [`OwnedView`]: An owned variant of [`View`], used for constructing views
+///
+/// [`GenericByteViewArray`]:
https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
+///
+/// # Notes
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub enum View<'a> {
Review Comment:
This should probably be named `ByteView` to avoid confusion with the list view
types.
##########
arrow-data/src/byte_view.rs:
##########
@@ -15,10 +15,477 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_buffer::Buffer;
+use arrow_buffer::{Buffer, ToByteSlice};
use arrow_schema::ArrowError;
+use std::fmt::Formatter;
+use std::ops::Range;
-#[derive(Debug, Copy, Clone, Default)]
+/// A `View` is a `u128` value that represents a single value in a
+/// [`GenericByteViewArray`].
+///
+/// Depending on the array type, the value may be a utf8 string or simply
bytes.
+/// The layout of the u128 is different depending on the length of the bytes
+/// stored at that location:
+///
+/// # 12 or fewer bytes [`InlineView`]
+///
+/// Values with 12 or fewer bytes are stored directly inlined in the `u128`.
See
+/// [`InlineView`] for field access.
+///
+/// ```text
+///
┌───────────────────────────────────────────┬──────────────┐
+/// │ data │
length │
+/// Strings, len <= 12 │ (padded with \0) │
(u32) │
+/// (InlineView) │ │
│
+///
└───────────────────────────────────────────┴──────────────┘
+/// 127 31
0 bit
+///
offset
+/// ```
+///
+/// # More than 12 bytes [`OffsetView`]
+///
+/// Values with more than 12 bytes store the first 4 bytes inline, an offset
and
+/// buffer index that reference the actual data (including the first 4 bytes)
in
+/// an externally managed buffer. See [`OffsetView`] for field access.
+///
+/// ```text
+///
┌──────────────┬─────────────┬──────────────┬──────────────┐
+/// │buffer offset │ buffer index│ data prefix │
length │
+/// Strings, len > 12 │ (u32) │ (u32) │ (4 bytes) │
(u32) │
+/// (OffsetView) │ │ │ │
│
+///
└──────────────┴─────────────┴──────────────┴──────────────┘
+/// 127 95 63 31
0 bit
+///
offset
+/// ```
+///
+/// See Also:
+/// * [`OwnedView`]: An owned variant of [`View`], used for constructing views
+///
+/// [`GenericByteViewArray`]:
https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
+///
+/// # Notes
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub enum View<'a> {
+ /// Entire string is inlined
+ Inline(InlineView<'a>),
+ /// String is stored in buffer, 4 byte prefix stored inline
+ Offset(OffsetView<'a>),
+}
+
+impl<'a> View<'a> {
+ /// Create a new `View` representing the contents of a `u128`
+ #[inline(always)]
+ pub fn new(v: &'a u128) -> Self {
+ let len = *v as u32;
+ if len <= 12 {
+ Self::Inline(InlineView::from(v))
+ } else {
+ Self::Offset(OffsetView::from(v))
+ }
+ }
+
+ /// Convert the view to a `u128`
+ pub fn to_u128(&self) -> u128 {
+ match self {
+ Self::Inline(inline) => inline.to_u128(),
+ Self::Offset(offset) => offset.to_u128(),
+ }
+ }
+
+ /// Return an [`OwnedView`] representing this view
+ pub fn to_owned(&self) -> OwnedView {
+ OwnedView::new(self.to_u128())
+ }
+}
+
+impl<'a> From<&'a u128> for View<'a> {
+ #[inline(always)]
+ fn from(v: &'a u128) -> Self {
+ Self::new(v)
+ }
+}
+
+/// Owned variant of [`View`] for constructing views from a string or byte
slice.
+///
+/// # Example
+/// ```
+/// # use arrow_data::OwnedView;
+/// // construct a view from a string
+/// let view = OwnedView::new_from_str("hello");
+/// assert!(matches!(view, OwnedView::Inline(_)));
+/// ```
+///
+/// ```
+/// # use arrow_data::OwnedView;
+/// // construct a view from a longer string
+/// let view = OwnedView::new_from_str("hello my name is crumple faced fish");
+/// assert!(matches!(view, OwnedView::Offset(_)));
+/// ```
+///
+/// # Notes
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(PartialEq)]
+pub enum OwnedView {
+ /// [`InlineView`]: Data is inlined (12 or fewer bytes)
+ Inline(u128),
+ /// [`OffsetView`]: Data is stored in a buffer (more than 12 bytes)
+ Offset(u128),
+}
+
+impl OwnedView {
+ /// Create a new `OwnedView` from a preexisting u128 that represents a
view.
+ ///
+ /// Note no validation is done on the u128 (e.g. no length checking)
+ pub fn new(v: u128) -> Self {
+ let len = v as u32;
+ if len <= 12 {
+ Self::Inline(v)
+ } else {
+ Self::Offset(v)
+ }
+ }
+
+ /// Create a new view from a string
+ ///
+ /// See [`OwnedView::new_from_bytes`] for more details
+ pub fn new_from_str(value: &str) -> Self {
+ Self::new_from_bytes(value.as_bytes())
+ }
+
+ /// Construct an `OwnedView` from a byte slice
+ ///
+ /// This function constructs the appropriate view type to represent this
+ /// value, inlining the value or prefix as appropriate.
+ ///
+ /// # Notes:
+ /// * Does not manage any buffers / offsets
+ /// * A created [`OwnedView::Offset`] has buffer index and offset set to
zero
+ #[inline(always)]
+ pub fn new_from_bytes(v: &[u8]) -> Self {
+ let length: u32 = v.len().try_into().unwrap();
+ let mut view_buffer = [0; 16];
+ view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
+
+ if length <= 12 {
+ // copy all values
+ view_buffer[4..4 + v.len()].copy_from_slice(v);
+ Self::Inline(u128::from_le_bytes(view_buffer))
+ } else {
+ // copy 4 byte prefix
+ view_buffer[4..8].copy_from_slice(&v[0..4]);
+ Self::Offset(u128::from_le_bytes(view_buffer))
+ }
+ }
+
+ // Convert this `OwnedView` to a `View`
+ pub fn as_view(&self) -> View {
+ match self {
+ Self::Inline(inline) => View::Inline(InlineView::from(inline)),
+ Self::Offset(offset) => View::Offset(OffsetView::from(offset)),
+ }
+ }
+}
+
+impl std::fmt::Debug for OwnedView {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ // format with hex bytes
+ match self {
+ Self::Inline(inline) => write!(f,
"OwnedView::Inline({inline:#20x})"),
+ Self::Offset(offset) => write!(f,
"OwnedView::Offset({offset:#20x})"),
+ }
+ }
+}
+
+impl From<&str> for OwnedView {
+ fn from(value: &str) -> Self {
+ Self::new_from_str(value)
+ }
+}
+
+impl From<&[u8]> for OwnedView {
+ fn from(value: &[u8]) -> Self {
+ Self::new_from_bytes(value)
+ }
+}
+
+impl From<u128> for OwnedView {
+ fn from(value: u128) -> Self {
+ Self::new(value)
+ }
+}
+
+/// A view for data where the variable length data is less than or equal to 12.
+///
+/// See documentation on [`View`] for details.
+///
+/// # Notes
+/// Note there is no validation done when converting to/from u128
+///
+/// Equality is based on the bitwise value of the view, not the data it
+/// logically points to
+#[derive(Copy, Clone, PartialEq)]
+pub struct InlineView<'a>(&'a u128);
Review Comment:
I wonder if we even need this borrow: `u128` is `Copy`, and removing the
indirection might help LLVM generate better code.
##########
arrow-data/src/byte_view.rs:
##########
@@ -15,10 +15,477 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_buffer::Buffer;
+use arrow_buffer::{Buffer, ToByteSlice};
use arrow_schema::ArrowError;
+use std::fmt::Formatter;
+use std::ops::Range;
-#[derive(Debug, Copy, Clone, Default)]
+/// A `View` is a `u128` value that represents a single value in a
+/// [`GenericByteViewArray`].
+///
+/// Depending on the array type, the value may be a utf8 string or simply
bytes.
+/// The layout of the u128 is different depending on the length of the bytes
+/// stored at that location:
+///
+/// # 12 or fewer bytes [`InlineView`]
+///
+/// Values with 12 or fewer bytes are stored directly inlined in the `u128`.
See
+/// [`InlineView`] for field access.
+///
+/// ```text
+///
┌───────────────────────────────────────────┬──────────────┐
+/// │ data │
length │
+/// Strings, len <= 12 │ (padded with \0) │
(u32) │
+/// (InlineView) │ │
│
+///
└───────────────────────────────────────────┴──────────────┘
+/// 127 31
0 bit
+///
offset
+/// ```
+///
+/// # More than 12 bytes [`OffsetView`]
+///
+/// Values with more than 12 bytes store the first 4 bytes inline, an offset
and
+/// buffer index that reference the actual data (including the first 4 bytes)
in
+/// an externally managed buffer. See [`OffsetView`] for field access.
+///
+/// ```text
+///
┌──────────────┬─────────────┬──────────────┬──────────────┐
+/// │buffer offset │ buffer index│ data prefix │
length │
+/// Strings, len > 12 │ (u32) │ (u32) │ (4 bytes) │
(u32) │
+/// (OffsetView) │ │ │ │
│
+///
└──────────────┴─────────────┴──────────────┴──────────────┘
+/// 127 95 63 31
0 bit
+///
offset
+/// ```
+///
+/// See Also:
+/// * [`OwnedView`]: An owned variant of [`View`], used for constructing views
+///
+/// [`GenericByteViewArray`]:
https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
+///
+/// # Notes
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub enum View<'a> {
+ /// Entire string is inlined
+ Inline(InlineView<'a>),
+ /// String is stored in buffer, 4 byte prefix stored inline
+ Offset(OffsetView<'a>),
+}
+
+impl<'a> View<'a> {
+ /// Create a new `View` representing the contents of a `u128`
+ #[inline(always)]
+ pub fn new(v: &'a u128) -> Self {
+ let len = *v as u32;
+ if len <= 12 {
+ Self::Inline(InlineView::from(v))
+ } else {
+ Self::Offset(OffsetView::from(v))
+ }
+ }
+
+ /// Convert the view to a `u128`
+ pub fn to_u128(&self) -> u128 {
+ match self {
+ Self::Inline(inline) => inline.to_u128(),
+ Self::Offset(offset) => offset.to_u128(),
+ }
+ }
+
+ /// Return an [`OwnedView`] representing this view
+ pub fn to_owned(&self) -> OwnedView {
+ OwnedView::new(self.to_u128())
+ }
+}
+
+impl<'a> From<&'a u128> for View<'a> {
+ #[inline(always)]
+ fn from(v: &'a u128) -> Self {
+ Self::new(v)
+ }
+}
+
+/// Owned variant of [`View`] for constructing views from a string or byte
slice.
+///
+/// # Example
+/// ```
+/// # use arrow_data::OwnedView;
+/// // construct a view from a string
+/// let view = OwnedView::new_from_str("hello");
+/// assert!(matches!(view, OwnedView::Inline(_)));
+/// ```
+///
+/// ```
+/// # use arrow_data::OwnedView;
+/// // construct a view from a longer string
+/// let view = OwnedView::new_from_str("hello my name is crumple faced fish");
+/// assert!(matches!(view, OwnedView::Offset(_)));
+/// ```
+///
+/// # Notes
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(PartialEq)]
+pub enum OwnedView {
Review Comment:
What is this type adding? It feels like it isn't entirely necessary.
##########
arrow-data/src/byte_view.rs:
##########
@@ -15,10 +15,477 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_buffer::Buffer;
+use arrow_buffer::{Buffer, ToByteSlice};
use arrow_schema::ArrowError;
+use std::fmt::Formatter;
+use std::ops::Range;
-#[derive(Debug, Copy, Clone, Default)]
+/// A `View` is a `u128` value that represents a single value in a
+/// [`GenericByteViewArray`].
+///
+/// Depending on the array type, the value may be a utf8 string or simply
bytes.
+/// The layout of the u128 is different depending on the length of the bytes
+/// stored at that location:
+///
+/// # 12 or fewer bytes [`InlineView`]
+///
+/// Values with 12 or fewer bytes are stored directly inlined in the `u128`.
See
+/// [`InlineView`] for field access.
+///
+/// ```text
+///
┌───────────────────────────────────────────┬──────────────┐
+/// │ data │
length │
+/// Strings, len <= 12 │ (padded with \0) │
(u32) │
+/// (InlineView) │ │
│
+///
└───────────────────────────────────────────┴──────────────┘
+/// 127 31
0 bit
+///
offset
+/// ```
+///
+/// # More than 12 bytes [`OffsetView`]
+///
+/// Values with more than 12 bytes store the first 4 bytes inline, an offset
and
+/// buffer index that reference the actual data (including the first 4 bytes)
in
+/// an externally managed buffer. See [`OffsetView`] for field access.
+///
+/// ```text
+///
┌──────────────┬─────────────┬──────────────┬──────────────┐
+/// │buffer offset │ buffer index│ data prefix │
length │
+/// Strings, len > 12 │ (u32) │ (u32) │ (4 bytes) │
(u32) │
+/// (OffsetView) │ │ │ │
│
+///
└──────────────┴─────────────┴──────────────┴──────────────┘
+/// 127 95 63 31
0 bit
+///
offset
+/// ```
+///
+/// See Also:
+/// * [`OwnedView`]: An owned variant of [`View`], used for constructing views
+///
+/// [`GenericByteViewArray`]:
https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
+///
+/// # Notes
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub enum View<'a> {
+ /// Entire string is inlined
+ Inline(InlineView<'a>),
+ /// String is stored in buffer, 4 byte prefix stored inline
+ Offset(OffsetView<'a>),
+}
+
+impl<'a> View<'a> {
+ /// Create a new `View` representing the contents of a `u128`
+ #[inline(always)]
+ pub fn new(v: &'a u128) -> Self {
+ let len = *v as u32;
+ if len <= 12 {
+ Self::Inline(InlineView::from(v))
+ } else {
+ Self::Offset(OffsetView::from(v))
+ }
+ }
+
+ /// Convert the view to a `u128`
+ pub fn to_u128(&self) -> u128 {
+ match self {
+ Self::Inline(inline) => inline.to_u128(),
+ Self::Offset(offset) => offset.to_u128(),
+ }
+ }
+
+ /// Return an [`OwnedView`] representing this view
+ pub fn to_owned(&self) -> OwnedView {
+ OwnedView::new(self.to_u128())
+ }
+}
+
+impl<'a> From<&'a u128> for View<'a> {
+ #[inline(always)]
+ fn from(v: &'a u128) -> Self {
+ Self::new(v)
+ }
+}
+
+/// Owned variant of [`View`] for constructing views from a string or byte
slice.
+///
+/// # Example
+/// ```
+/// # use arrow_data::OwnedView;
+/// // construct a view from a string
+/// let view = OwnedView::new_from_str("hello");
+/// assert!(matches!(view, OwnedView::Inline(_)));
+/// ```
+///
+/// ```
+/// # use arrow_data::OwnedView;
+/// // construct a view from a longer string
+/// let view = OwnedView::new_from_str("hello my name is crumple faced fish");
+/// assert!(matches!(view, OwnedView::Offset(_)));
+/// ```
+///
+/// # Notes
+/// Equality is based on the bitwise value of the view, not the data it
logically points to
+#[derive(PartialEq)]
+pub enum OwnedView {
+ /// [`InlineView`]: Data is inlined (12 or fewer bytes)
+ Inline(u128),
+ /// [`OffsetView`]: Data is stored in a buffer (more than 12 bytes)
+ Offset(u128),
+}
+
+impl OwnedView {
+ /// Create a new `OwnedView` from a preexisting u128 that represents a
view.
+ ///
+ /// Note no validation is done on the u128 (e.g. no length checking)
+ pub fn new(v: u128) -> Self {
+ let len = v as u32;
+ if len <= 12 {
+ Self::Inline(v)
+ } else {
+ Self::Offset(v)
+ }
+ }
+
+ /// Create a new view from a string
+ ///
+ /// See [`OwnedView::new_from_bytes`] for more details
+ pub fn new_from_str(value: &str) -> Self {
+ Self::new_from_bytes(value.as_bytes())
+ }
+
+ /// Construct an `OwnedView` from a byte slice
+ ///
+ /// This function constructs the appropriate view type to represent this
+ /// value, inlining the value or prefix as appropriate.
+ ///
+ /// # Notes:
+ /// * Does not manage any buffers / offsets
+ /// * A created [`OwnedView::Offset`] has buffer index and offset set to
zero
+ #[inline(always)]
+ pub fn new_from_bytes(v: &[u8]) -> Self {
+ let length: u32 = v.len().try_into().unwrap();
+ let mut view_buffer = [0; 16];
+ view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
+
+ if length <= 12 {
+ // copy all values
+ view_buffer[4..4 + v.len()].copy_from_slice(v);
+ Self::Inline(u128::from_le_bytes(view_buffer))
+ } else {
+ // copy 4 byte prefix
+ view_buffer[4..8].copy_from_slice(&v[0..4]);
+ Self::Offset(u128::from_le_bytes(view_buffer))
+ }
+ }
+
+ // Convert this `OwnedView` to a `View`
+ pub fn as_view(&self) -> View {
+ match self {
+ Self::Inline(inline) => View::Inline(InlineView::from(inline)),
+ Self::Offset(offset) => View::Offset(OffsetView::from(offset)),
+ }
+ }
+}
+
+impl std::fmt::Debug for OwnedView {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ // format with hex bytes
+ match self {
+ Self::Inline(inline) => write!(f,
"OwnedView::Inline({inline:#20x})"),
+ Self::Offset(offset) => write!(f,
"OwnedView::Offset({offset:#20x})"),
+ }
+ }
+}
+
+impl From<&str> for OwnedView {
+ fn from(value: &str) -> Self {
+ Self::new_from_str(value)
+ }
+}
+
+impl From<&[u8]> for OwnedView {
+ fn from(value: &[u8]) -> Self {
+ Self::new_from_bytes(value)
+ }
+}
+
+impl From<u128> for OwnedView {
+ fn from(value: u128) -> Self {
+ Self::new(value)
+ }
+}
+
+/// A view for data where the variable length data is less than or equal to 12.
+///
+/// See documentation on [`View`] for details.
+///
+/// # Notes
+/// Note there is no validation done when converting to/from u128
+///
+/// Equality is based on the bitwise value of the view, not the data it
+/// logically points to
+#[derive(Copy, Clone, PartialEq)]
+pub struct InlineView<'a>(&'a u128);
+
+impl<'a> std::fmt::Debug for InlineView<'a> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ // format with hex bytes
+ write!(f, "InlineView({:#020x})", self.0)
+ }
+}
+
+impl<'a> InlineView<'a> {
+ /// Create a new inline view from a u128
+ #[inline(always)]
+ pub fn new_from_u128(v: &'a u128) -> Self {
+ Self(v)
+ }
+
+ /// Return a reference to the u128
+ pub fn as_u128(self) -> &'a u128 {
+ self.0
Review Comment:
```suggestion
pub fn as_u128(self) -> u128 {
*self.0
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]