This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 47e4b6166 Use Typed Buffers in Arrays (#1811) (#1176) (#3743)
47e4b6166 is described below

commit 47e4b6166d67c50c87d99cd18efd770d5c331918
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Feb 23 09:52:34 2023 +0000

    Use Typed Buffers in Arrays (#1811) (#1176) (#3743)
    
    * Remove RawPtrBox (#1811) (#1176)
    
    * Clippy
    
    * Extract get_offsets function
---
 arrow-array/src/array/boolean_array.rs           | 19 ++----
 arrow-array/src/array/byte_array.rs              | 42 ++++++-------
 arrow-array/src/array/fixed_size_binary_array.rs | 10 ++--
 arrow-array/src/array/fixed_size_list_array.rs   |  5 +-
 arrow-array/src/array/list_array.rs              | 46 ++++-----------
 arrow-array/src/array/map_array.rs               | 27 ++-------
 arrow-array/src/array/mod.rs                     | 27 ++++++++-
 arrow-array/src/array/primitive_array.rs         | 42 ++++---------
 arrow-array/src/lib.rs                           |  1 -
 arrow-array/src/raw_pointer.rs                   | 75 ------------------------
 arrow-array/src/record_batch.rs                  |  2 +-
 arrow-buffer/src/buffer/immutable.rs             | 70 ++++++++++++++++------
 arrow-buffer/src/buffer/mod.rs                   |  2 +
 arrow-buffer/src/buffer/mutable.rs               |  4 +-
 arrow-buffer/src/buffer/offset.rs                | 58 ++++++++++++++++++
 arrow-buffer/src/buffer/scalar.rs                | 71 +++++++++++++---------
 arrow-buffer/src/bytes.rs                        |  2 +-
 17 files changed, 242 insertions(+), 261 deletions(-)

diff --git a/arrow-array/src/array/boolean_array.rs 
b/arrow-array/src/array/boolean_array.rs
index 4c83dcf41..428a721dd 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -15,10 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::array::print_long_array;
 use crate::builder::BooleanBuilder;
 use crate::iterator::BooleanIter;
-use crate::raw_pointer::RawPtrBox;
-use crate::{print_long_array, Array, ArrayAccessor};
+use crate::{Array, ArrayAccessor};
 use arrow_buffer::{bit_util, Buffer, MutableBuffer};
 use arrow_data::bit_mask::combine_option_bitmap;
 use arrow_data::ArrayData;
@@ -67,9 +67,7 @@ use std::any::Any;
 #[derive(Clone)]
 pub struct BooleanArray {
     data: ArrayData,
-    /// Pointer to the value array. The lifetime of this must be <= to the 
value buffer
-    /// stored in `data`, so it's safe to store.
-    raw_values: RawPtrBox<u8>,
+    raw_values: Buffer,
 }
 
 impl std::fmt::Debug for BooleanArray {
@@ -102,7 +100,7 @@ impl BooleanArray {
     ///
     /// Note this doesn't take the offset of this array into account.
     pub fn values(&self) -> &Buffer {
-        &self.data.buffers()[0]
+        &self.raw_values
     }
 
     /// Returns the number of non null, true values within this array
@@ -328,13 +326,8 @@ impl From<ArrayData> for BooleanArray {
             1,
             "BooleanArray data should contain a single buffer only (values 
buffer)"
         );
-        let ptr = data.buffers()[0].as_ptr();
-        Self {
-            data,
-            // SAFETY:
-            // ArrayData must be valid, and validated data type above
-            raw_values: unsafe { RawPtrBox::new(ptr) },
-        }
+        let raw_values = data.buffers()[0].clone();
+        Self { data, raw_values }
     }
 }
 
diff --git a/arrow-array/src/array/byte_array.rs 
b/arrow-array/src/array/byte_array.rs
index 2cb04efb8..f6946228c 100644
--- a/arrow-array/src/array/byte_array.rs
+++ b/arrow-array/src/array/byte_array.rs
@@ -15,14 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::array::{empty_offsets, print_long_array};
+use crate::array::{get_offsets, print_long_array};
 use crate::builder::GenericByteBuilder;
 use crate::iterator::ArrayIter;
-use crate::raw_pointer::RawPtrBox;
 use crate::types::bytes::ByteArrayNativeType;
 use crate::types::ByteArrayType;
 use crate::{Array, ArrayAccessor, OffsetSizeTrait};
-use arrow_buffer::ArrowNativeType;
+use arrow_buffer::buffer::OffsetBuffer;
+use arrow_buffer::{ArrowNativeType, Buffer};
 use arrow_data::ArrayData;
 use arrow_schema::DataType;
 use std::any::Any;
@@ -39,16 +39,16 @@ use std::any::Any;
 /// [`LargeBinaryArray`]: crate::LargeBinaryArray
 pub struct GenericByteArray<T: ByteArrayType> {
     data: ArrayData,
-    value_offsets: RawPtrBox<T::Offset>,
-    value_data: RawPtrBox<u8>,
+    value_offsets: OffsetBuffer<T::Offset>,
+    value_data: Buffer,
 }
 
 impl<T: ByteArrayType> Clone for GenericByteArray<T> {
     fn clone(&self) -> Self {
         Self {
             data: self.data.clone(),
-            value_offsets: self.value_offsets,
-            value_data: self.value_data,
+            value_offsets: self.value_offsets.clone(),
+            value_data: self.value_data.clone(),
         }
     }
 }
@@ -68,7 +68,7 @@ impl<T: ByteArrayType> GenericByteArray<T> {
 
     /// Returns the raw value data
     pub fn value_data(&self) -> &[u8] {
-        self.data.buffers()[1].as_slice()
+        self.value_data.as_slice()
     }
 
     /// Returns true if all data within this array is ASCII
@@ -82,15 +82,7 @@ impl<T: ByteArrayType> GenericByteArray<T> {
     /// Returns the offset values in the offsets buffer
     #[inline]
     pub fn value_offsets(&self) -> &[T::Offset] {
-        // Soundness
-        //     pointer alignment & location is ensured by RawPtrBox
-        //     buffer bounds/offset is ensured by the ArrayData instance.
-        unsafe {
-            std::slice::from_raw_parts(
-                self.value_offsets.as_ptr().add(self.data.offset()),
-                self.len() + 1,
-            )
-        }
+        &self.value_offsets
     }
 
     /// Returns the element at index `i`
@@ -161,6 +153,8 @@ impl<T: ByteArrayType> GenericByteArray<T> {
             .slice_with_length(self.data.offset() * element_len, value_len * 
element_len);
 
         drop(self.data);
+        drop(self.value_data);
+        drop(self.value_offsets);
 
         let try_mutable_null_buffer = match null_bit_buffer {
             None => Ok(None),
@@ -280,18 +274,16 @@ impl<T: ByteArrayType> From<ArrayData> for 
GenericByteArray<T> {
             T::Offset::PREFIX,
             T::PREFIX,
         );
-        // Handle case of empty offsets
-        let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
-            true => empty_offsets::<T::Offset>().as_ptr() as *const _,
-            false => data.buffers()[0].as_ptr(),
-        };
-        let values = data.buffers()[1].as_ptr();
+        // SAFETY:
+        // ArrayData is valid, and verified type above
+        let value_offsets = unsafe { get_offsets(&data) };
+        let value_data = data.buffers()[1].clone();
         Self {
             data,
             // SAFETY:
             // ArrayData must be valid, and validated data type above
-            value_offsets: unsafe { RawPtrBox::new(offsets) },
-            value_data: unsafe { RawPtrBox::new(values) },
+            value_offsets,
+            value_data,
         }
     }
 }
diff --git a/arrow-array/src/array/fixed_size_binary_array.rs 
b/arrow-array/src/array/fixed_size_binary_array.rs
index 936fb3025..89ace430d 100644
--- a/arrow-array/src/array/fixed_size_binary_array.rs
+++ b/arrow-array/src/array/fixed_size_binary_array.rs
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::array::print_long_array;
 use crate::iterator::FixedSizeBinaryIter;
-use crate::raw_pointer::RawPtrBox;
-use crate::{print_long_array, Array, ArrayAccessor, FixedSizeListArray};
+use crate::{Array, ArrayAccessor, FixedSizeListArray};
 use arrow_buffer::{bit_util, Buffer, MutableBuffer};
 use arrow_data::ArrayData;
 use arrow_schema::{ArrowError, DataType};
@@ -50,7 +50,7 @@ use std::any::Any;
 #[derive(Clone)]
 pub struct FixedSizeBinaryArray {
     data: ArrayData,
-    value_data: RawPtrBox<u8>,
+    value_data: Buffer,
     length: i32,
 }
 
@@ -357,14 +357,14 @@ impl From<ArrayData> for FixedSizeBinaryArray {
             1,
             "FixedSizeBinaryArray data should contain 1 buffer only (values)"
         );
-        let value_data = data.buffers()[0].as_ptr();
+        let value_data = data.buffers()[0].clone();
         let length = match data.data_type() {
             DataType::FixedSizeBinary(len) => *len,
             _ => panic!("Expected data type to be FixedSizeBinary"),
         };
         Self {
             data,
-            value_data: unsafe { RawPtrBox::new(value_data) },
+            value_data,
             length,
         }
     }
diff --git a/arrow-array/src/array/fixed_size_list_array.rs 
b/arrow-array/src/array/fixed_size_list_array.rs
index c361d2d44..6e228ba3c 100644
--- a/arrow-array/src/array/fixed_size_list_array.rs
+++ b/arrow-array/src/array/fixed_size_list_array.rs
@@ -15,10 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::array::print_long_array;
 use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder};
-use crate::{
-    make_array, print_long_array, Array, ArrayAccessor, ArrayRef, 
ArrowPrimitiveType,
-};
+use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType};
 use arrow_data::ArrayData;
 use arrow_schema::DataType;
 use std::any::Any;
diff --git a/arrow-array/src/array/list_array.rs 
b/arrow-array/src/array/list_array.rs
index b378549eb..6b63269d1 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -15,12 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::array::make_array;
+use crate::array::{get_offsets, make_array, print_long_array};
 use crate::builder::{GenericListBuilder, PrimitiveBuilder};
 use crate::{
-    iterator::GenericListArrayIter, print_long_array, raw_pointer::RawPtrBox, 
Array,
-    ArrayAccessor, ArrayRef, ArrowPrimitiveType,
+    iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, 
ArrowPrimitiveType,
 };
+use arrow_buffer::buffer::OffsetBuffer;
 use arrow_buffer::ArrowNativeType;
 use arrow_data::ArrayData;
 use arrow_schema::{ArrowError, DataType, Field};
@@ -45,35 +45,24 @@ impl OffsetSizeTrait for i64 {
     const PREFIX: &'static str = "Large";
 }
 
-/// Returns a slice of `OffsetSize` consisting of a single zero value
-#[inline]
-pub(crate) fn empty_offsets<OffsetSize: OffsetSizeTrait>() -> &'static 
[OffsetSize] {
-    static OFFSET: &[i64] = &[0];
-    // SAFETY:
-    // OffsetSize is ArrowNativeType and is therefore trivially transmutable
-    let (prefix, val, suffix) = unsafe { OFFSET.align_to::<OffsetSize>() };
-    assert!(prefix.is_empty() && suffix.is_empty());
-    val
-}
-
 /// Generic struct for a variable-size list array.
 ///
 /// Columnar format in Apache Arrow:
 /// 
<https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout>
 ///
 /// For non generic lists, you may wish to consider using [`ListArray`] or 
[`LargeListArray`]`
-pub struct GenericListArray<OffsetSize> {
+pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
     data: ArrayData,
     values: ArrayRef,
-    value_offsets: RawPtrBox<OffsetSize>,
+    value_offsets: OffsetBuffer<OffsetSize>,
 }
 
-impl<OffsetSize> Clone for GenericListArray<OffsetSize> {
+impl<OffsetSize: OffsetSizeTrait> Clone for GenericListArray<OffsetSize> {
     fn clone(&self) -> Self {
         Self {
             data: self.data.clone(),
             values: self.values.clone(),
-            value_offsets: self.value_offsets,
+            value_offsets: self.value_offsets.clone(),
         }
     }
 }
@@ -118,15 +107,7 @@ impl<OffsetSize: OffsetSizeTrait> 
GenericListArray<OffsetSize> {
     /// Returns the offset values in the offsets buffer
     #[inline]
     pub fn value_offsets(&self) -> &[OffsetSize] {
-        // Soundness
-        //     pointer alignment & location is ensured by RawPtrBox
-        //     buffer bounds/offset is ensured by the ArrayData instance.
-        unsafe {
-            std::slice::from_raw_parts(
-                self.value_offsets.as_ptr().add(self.data.offset()),
-                self.len() + 1,
-            )
-        }
+        &self.value_offsets
     }
 
     /// Returns the length for value at index `i`.
@@ -242,15 +223,10 @@ impl<OffsetSize: OffsetSizeTrait> 
GenericListArray<OffsetSize> {
         }
 
         let values = make_array(values);
-        // Handle case of empty offsets
-        let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
-            true => empty_offsets::<OffsetSize>().as_ptr() as *const _,
-            false => data.buffers()[0].as_ptr(),
-        };
-
         // SAFETY:
-        // Verified list type in call to `Self::get_type`
-        let value_offsets = unsafe { RawPtrBox::new(offsets) };
+        // ArrayData is valid, and verified type above
+        let value_offsets = unsafe { get_offsets(&data) };
+
         Ok(Self {
             data,
             values,
diff --git a/arrow-array/src/array/map_array.rs 
b/arrow-array/src/array/map_array.rs
index b0eb4a3c9..8c9b02921 100644
--- a/arrow-array/src/array/map_array.rs
+++ b/arrow-array/src/array/map_array.rs
@@ -15,8 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::raw_pointer::RawPtrBox;
-use crate::{make_array, print_long_array, Array, ArrayRef, StringArray, 
StructArray};
+use crate::array::{get_offsets, print_long_array};
+use crate::{make_array, Array, ArrayRef, StringArray, StructArray};
+use arrow_buffer::buffer::OffsetBuffer;
 use arrow_buffer::{ArrowNativeType, Buffer, ToByteSlice};
 use arrow_data::ArrayData;
 use arrow_schema::{ArrowError, DataType, Field};
@@ -38,7 +39,7 @@ pub struct MapArray {
     /// The second child of `entries`, the "values" of this MapArray
     values: ArrayRef,
     /// The start and end offsets of each entry
-    value_offsets: RawPtrBox<i32>,
+    value_offsets: OffsetBuffer<i32>,
 }
 
 impl MapArray {
@@ -86,15 +87,7 @@ impl MapArray {
     /// Returns the offset values in the offsets buffer
     #[inline]
     pub fn value_offsets(&self) -> &[i32] {
-        // Soundness
-        //     pointer alignment & location is ensured by RawPtrBox
-        //     buffer bounds/offset is ensured by the ArrayData instance.
-        unsafe {
-            std::slice::from_raw_parts(
-                self.value_offsets.as_ptr().add(self.data.offset()),
-                self.len() + 1,
-            )
-        }
+        &self.value_offsets
     }
 
     /// Returns the length for value at index `i`.
@@ -159,18 +152,10 @@ impl MapArray {
         let keys = make_array(entries.child_data()[0].clone());
         let values = make_array(entries.child_data()[1].clone());
         let entries = make_array(entries);
-        let value_offsets = data.buffers()[0].as_ptr();
 
         // SAFETY:
         // ArrayData is valid, and verified type above
-        let value_offsets = unsafe { RawPtrBox::<i32>::new(value_offsets) };
-        unsafe {
-            if (*value_offsets.as_ptr().offset(0)) != 0 {
-                return Err(ArrowError::InvalidArgumentError(String::from(
-                    "offsets do not start at zero",
-                )));
-            }
-        }
+        let value_offsets = unsafe { get_offsets(&data) };
 
         Ok(Self {
             data,
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index b293d797e..27973a40f 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -20,6 +20,8 @@
 mod binary_array;
 
 use crate::types::*;
+use arrow_buffer::buffer::{OffsetBuffer, ScalarBuffer};
+use arrow_buffer::ArrowNativeType;
 use arrow_data::ArrayData;
 use arrow_schema::{DataType, IntervalUnit, TimeUnit};
 use std::any::Any;
@@ -636,8 +638,29 @@ pub fn new_null_array(data_type: &DataType, length: usize) 
-> ArrayRef {
     make_array(ArrayData::new_null(data_type, length))
 }
 
-// Helper function for printing potentially long arrays.
-pub(crate) fn print_long_array<A, F>(
+/// Helper function that gets offset from an [`ArrayData`]
+///
+/// # Safety
+///
+/// - ArrayData must contain a valid [`OffsetBuffer`] as its first buffer
+unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> 
{
+    match data.is_empty() && data.buffers()[0].is_empty() {
+        true => OffsetBuffer::new_empty(),
+        false => {
+            let buffer = ScalarBuffer::new(
+                data.buffers()[0].clone(),
+                data.offset(),
+                data.len() + 1,
+            );
+            // Safety:
+            // ArrayData is valid
+            unsafe { OffsetBuffer::new_unchecked(buffer) }
+        }
+    }
+}
+
+/// Helper function for printing potentially long arrays.
+fn print_long_array<A, F>(
     array: &A,
     f: &mut std::fmt::Formatter,
     print_item: F,
diff --git a/arrow-array/src/array/primitive_array.rs 
b/arrow-array/src/array/primitive_array.rs
index b64534e98..53217a06f 100644
--- a/arrow-array/src/array/primitive_array.rs
+++ b/arrow-array/src/array/primitive_array.rs
@@ -15,16 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::array::print_long_array;
 use crate::builder::{BooleanBufferBuilder, BufferBuilder, PrimitiveBuilder};
 use crate::iterator::PrimitiveIter;
-use crate::raw_pointer::RawPtrBox;
 use crate::temporal_conversions::{
     as_date, as_datetime, as_datetime_with_timezone, as_duration, as_time,
 };
 use crate::timezone::Tz;
 use crate::trusted_len::trusted_len_unzip;
-use crate::{print_long_array, Array, ArrayAccessor};
 use crate::{types::*, ArrowNativeTypeOp};
+use crate::{Array, ArrayAccessor};
+use arrow_buffer::buffer::ScalarBuffer;
 use arrow_buffer::{i256, ArrowNativeType, Buffer};
 use arrow_data::bit_iterator::try_for_each_valid_idx;
 use arrow_data::ArrayData;
@@ -266,22 +267,16 @@ pub trait ArrowPrimitiveType: 'static {
 /// ```
 pub struct PrimitiveArray<T: ArrowPrimitiveType> {
     /// Underlying ArrayData
-    /// # Safety
-    /// must have exactly one buffer, aligned to type T
     data: ArrayData,
-    /// Pointer to the value array. The lifetime of this must be <= to the 
value buffer
-    /// stored in `data`, so it's safe to store.
-    /// # Safety
-    /// raw_values must have a value equivalent to 
`data.buffers()[0].raw_data()`
-    /// raw_values must have alignment for type T::NativeType
-    raw_values: RawPtrBox<T::Native>,
+    /// Values data
+    raw_values: ScalarBuffer<T::Native>,
 }
 
 impl<T: ArrowPrimitiveType> Clone for PrimitiveArray<T> {
     fn clone(&self) -> Self {
         Self {
             data: self.data.clone(),
-            raw_values: self.raw_values,
+            raw_values: self.raw_values.clone(),
         }
     }
 }
@@ -301,15 +296,7 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
     /// Returns a slice of the values of this array
     #[inline]
     pub fn values(&self) -> &[T::Native] {
-        // Soundness
-        //     raw_values alignment & location is ensured by fn 
from(ArrayDataRef)
-        //     buffer bounds/offset is ensured by the ArrayData instance.
-        unsafe {
-            std::slice::from_raw_parts(
-                self.raw_values.as_ptr().add(self.data.offset()),
-                self.len(),
-            )
-        }
+        &self.raw_values
     }
 
     /// Returns a new primitive array builder
@@ -339,8 +326,7 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
     /// caller must ensure that the passed in offset is less than the array 
len()
     #[inline]
     pub unsafe fn value_unchecked(&self, i: usize) -> T::Native {
-        let offset = i + self.offset();
-        *self.raw_values.as_ptr().add(offset)
+        *self.raw_values.get_unchecked(i)
     }
 
     /// Returns the primitive value at index `i`.
@@ -632,6 +618,7 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
             .slice_with_length(self.data.offset() * element_len, len * 
element_len);
 
         drop(self.data);
+        drop(self.raw_values);
 
         let try_mutable_null_buffer = match null_bit_buffer {
             None => Ok(None),
@@ -724,6 +711,7 @@ impl<'a, T: ArrowPrimitiveType> ArrayAccessor for &'a 
PrimitiveArray<T> {
         PrimitiveArray::value(self, index)
     }
 
+    #[inline]
     unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
         PrimitiveArray::value_unchecked(self, index)
     }
@@ -1085,13 +1073,9 @@ impl<T: ArrowPrimitiveType> From<ArrayData> for 
PrimitiveArray<T> {
             "PrimitiveArray data should contain a single buffer only (values 
buffer)"
         );
 
-        let ptr = data.buffers()[0].as_ptr();
-        Self {
-            data,
-            // SAFETY:
-            // ArrayData must be valid, and validated data type above
-            raw_values: unsafe { RawPtrBox::new(ptr) },
-        }
+        let raw_values =
+            ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), 
data.len());
+        Self { data, raw_values }
     }
 }
 
diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs
index 2cee2650e..400b6e262 100644
--- a/arrow-array/src/lib.rs
+++ b/arrow-array/src/lib.rs
@@ -179,7 +179,6 @@ pub mod builder;
 pub mod cast;
 mod delta;
 pub mod iterator;
-mod raw_pointer;
 pub mod run_iterator;
 pub mod temporal_conversions;
 pub mod timezone;
diff --git a/arrow-array/src/raw_pointer.rs b/arrow-array/src/raw_pointer.rs
deleted file mode 100644
index 0fea8c186..000000000
--- a/arrow-array/src/raw_pointer.rs
+++ /dev/null
@@ -1,75 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::ptr::NonNull;
-
-/// This struct is highly `unsafe` and offers the possibility to
-/// self-reference a [arrow_buffer::Buffer] from
-/// [arrow_data::ArrayData], as a pointer to the beginning of its
-/// contents.
-pub(super) struct RawPtrBox<T> {
-    ptr: NonNull<T>,
-}
-
-impl<T> Clone for RawPtrBox<T> {
-    fn clone(&self) -> Self {
-        Self { ptr: self.ptr }
-    }
-}
-
-impl<T> Copy for RawPtrBox<T> {}
-
-impl<T> RawPtrBox<T> {
-    /// # Safety
-    /// The user must guarantee that:
-    /// * the contents where `ptr` points to are never `moved`. This is 
guaranteed when they are Pinned.
-    /// * the lifetime of this struct does not outlive the lifetime of `ptr`.
-    /// Failure to fulfill any the above conditions results in undefined 
behavior.
-    /// # Panic
-    /// This function panics if:
-    /// * `ptr` is null
-    /// * `ptr` is not aligned to a slice of type `T`. This is guaranteed if 
it was built from a slice of type `T`.
-    pub(super) unsafe fn new(ptr: *const u8) -> Self {
-        let ptr = NonNull::new(ptr as *mut u8).expect("Pointer cannot be 
null");
-        assert_eq!(
-            ptr.as_ptr().align_offset(std::mem::align_of::<T>()),
-            0,
-            "memory is not aligned"
-        );
-        Self { ptr: ptr.cast() }
-    }
-
-    pub(super) fn as_ptr(&self) -> *const T {
-        self.ptr.as_ptr()
-    }
-}
-
-unsafe impl<T> Send for RawPtrBox<T> {}
-unsafe impl<T> Sync for RawPtrBox<T> {}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    #[should_panic(expected = "memory is not aligned")]
-    #[cfg_attr(miri, ignore)] // sometimes does not panic as expected
-    fn test_primitive_array_alignment() {
-        let bytes = vec![0u8, 1u8];
-        unsafe { RawPtrBox::<u64>::new(bytes.as_ptr().offset(1)) };
-    }
-}
diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs
index 3b517872a..04a559f21 100644
--- a/arrow-array/src/record_batch.rs
+++ b/arrow-array/src/record_batch.rs
@@ -603,7 +603,7 @@ mod tests {
         let record_batch =
             RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), 
Arc::new(b)])
                 .unwrap();
-        assert_eq!(record_batch.get_array_memory_size(), 592);
+        assert_eq!(record_batch.get_array_memory_size(), 640);
     }
 
     fn check_batch(record_batch: RecordBatch, num_rows: usize) {
diff --git a/arrow-buffer/src/buffer/immutable.rs 
b/arrow-buffer/src/buffer/immutable.rs
index 4048787c6..cbfba1e05 100644
--- a/arrow-buffer/src/buffer/immutable.rs
+++ b/arrow-buffer/src/buffer/immutable.rs
@@ -15,11 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::convert::AsRef;
 use std::fmt::Debug;
 use std::iter::FromIterator;
 use std::ptr::NonNull;
 use std::sync::Arc;
-use std::{convert::AsRef, usize};
 
 use crate::alloc::{Allocation, Deallocation};
 use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk};
@@ -30,26 +30,41 @@ use super::MutableBuffer;
 
 /// Buffer represents a contiguous memory region that can be shared with other 
buffers and across
 /// thread boundaries.
-#[derive(Clone, PartialEq, Debug)]
+#[derive(Clone, Debug)]
 pub struct Buffer {
     /// the internal byte buffer.
     data: Arc<Bytes>,
 
-    /// The offset into the buffer.
-    offset: usize,
+    /// Pointer into `data` valid
+    ///
+    /// We store a pointer instead of an offset to avoid pointer arithmetic
+    /// which causes LLVM to fail to vectorise code correctly
+    ptr: *const u8,
 
     /// Byte length of the buffer.
     length: usize,
 }
 
+impl PartialEq for Buffer {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_slice().eq(other.as_slice())
+    }
+}
+
+impl Eq for Buffer {}
+
+unsafe impl Send for Buffer where Bytes: Send {}
+unsafe impl Sync for Buffer where Bytes: Sync {}
+
 impl Buffer {
     /// Auxiliary method to create a new Buffer
     #[inline]
     pub fn from_bytes(bytes: Bytes) -> Self {
         let length = bytes.len();
+        let ptr = bytes.as_ptr();
         Buffer {
             data: Arc::new(bytes),
-            offset: 0,
+            ptr,
             length,
         }
     }
@@ -108,9 +123,10 @@ impl Buffer {
         deallocation: Deallocation,
     ) -> Self {
         let bytes = Bytes::new(ptr, len, deallocation);
+        let ptr = bytes.as_ptr();
         Buffer {
+            ptr,
             data: Arc::new(bytes),
-            offset: 0,
             length: len,
         }
     }
@@ -136,7 +152,7 @@ impl Buffer {
 
     /// Returns the byte slice stored in this buffer
     pub fn as_slice(&self) -> &[u8] {
-        &self.data[self.offset..(self.offset + self.length)]
+        unsafe { std::slice::from_raw_parts(self.ptr, self.length) }
     }
 
     /// Returns a new [Buffer] that is a slice of this buffer starting at 
`offset`.
@@ -145,13 +161,18 @@ impl Buffer {
     /// Panics iff `offset` is larger than `len`.
     pub fn slice(&self, offset: usize) -> Self {
         assert!(
-            offset <= self.len(),
+            offset <= self.length,
             "the offset of the new Buffer cannot exceed the existing length"
         );
+        // Safety:
+        // This cannot overflow as
+        // `self.offset + self.length < self.data.len()`
+        // `offset < self.length`
+        let ptr = unsafe { self.ptr.add(offset) };
         Self {
             data: self.data.clone(),
-            offset: self.offset + offset,
             length: self.length - offset,
+            ptr,
         }
     }
 
@@ -162,12 +183,15 @@ impl Buffer {
     /// Panics iff `(offset + length)` is larger than the existing length.
     pub fn slice_with_length(&self, offset: usize, length: usize) -> Self {
         assert!(
-            offset + length <= self.len(),
+            offset.saturating_add(length) <= self.length,
             "the offset of the new Buffer cannot exceed the existing length"
         );
+        // Safety:
+        // offset + length <= self.length
+        let ptr = unsafe { self.ptr.add(offset) };
         Self {
             data: self.data.clone(),
-            offset: self.offset + offset,
+            ptr,
             length,
         }
     }
@@ -178,7 +202,7 @@ impl Buffer {
     /// stored anywhere, to avoid dangling pointers.
     #[inline]
     pub fn as_ptr(&self) -> *const u8 {
-        unsafe { self.data.ptr().as_ptr().add(self.offset) }
+        self.ptr
     }
 
     /// View buffer as a slice of a specific type.
@@ -231,18 +255,17 @@ impl Buffer {
     /// Returns `MutableBuffer` for mutating the buffer if this buffer is not 
shared.
     /// Returns `Err` if this is shared or its allocation is from an external 
source.
     pub fn into_mutable(self) -> Result<MutableBuffer, Self> {
-        let offset_ptr = self.as_ptr();
-        let offset = self.offset;
+        let ptr = self.ptr;
         let length = self.length;
         Arc::try_unwrap(self.data)
             .and_then(|bytes| {
                 // The pointer of underlying buffer should not be offset.
-                assert_eq!(offset_ptr, bytes.ptr().as_ptr());
+                assert_eq!(ptr, bytes.ptr().as_ptr());
                 MutableBuffer::from_bytes(bytes).map_err(Arc::new)
             })
             .map_err(|bytes| Buffer {
                 data: bytes,
-                offset,
+                ptr,
                 length,
             })
     }
@@ -262,7 +285,7 @@ impl<T: AsRef<[u8]>> From<T> for Buffer {
 }
 
 /// Creating a `Buffer` instance by storing the boolean values into the buffer
-impl std::iter::FromIterator<bool> for Buffer {
+impl FromIterator<bool> for Buffer {
     fn from_iter<I>(iter: I) -> Self
     where
         I: IntoIterator<Item = bool>,
@@ -321,10 +344,10 @@ impl Buffer {
     pub unsafe fn try_from_trusted_len_iter<
         E,
         T: ArrowNativeType,
-        I: Iterator<Item = std::result::Result<T, E>>,
+        I: Iterator<Item = Result<T, E>>,
     >(
         iterator: I,
-    ) -> std::result::Result<Self, E> {
+    ) -> Result<Self, E> {
         Ok(MutableBuffer::try_from_trusted_len_iter(iterator)?.into())
     }
 }
@@ -600,4 +623,13 @@ mod tests {
         let slice = buffer.typed_data::<i32>();
         assert_eq!(slice, &[2, 3, 4, 5]);
     }
+
+    #[test]
+    #[should_panic(
+        expected = "the offset of the new Buffer cannot exceed the existing 
length"
+    )]
+    fn slice_overflow() {
+        let buffer = Buffer::from(MutableBuffer::from_len_zeroed(12));
+        buffer.slice_with_length(2, usize::MAX);
+    }
 }
diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs
index b9201f774..7c12e1804 100644
--- a/arrow-buffer/src/buffer/mod.rs
+++ b/arrow-buffer/src/buffer/mod.rs
@@ -18,6 +18,8 @@
 //! This module contains two main structs: [Buffer] and [MutableBuffer]. A 
buffer represents
 //! a contiguous memory region that can be shared via `offsets`.
 
+mod offset;
+pub use offset::*;
 mod immutable;
 pub use immutable::*;
 mod mutable;
diff --git a/arrow-buffer/src/buffer/mutable.rs 
b/arrow-buffer/src/buffer/mutable.rs
index b70a74e84..2e6e2f1d7 100644
--- a/arrow-buffer/src/buffer/mutable.rs
+++ b/arrow-buffer/src/buffer/mutable.rs
@@ -581,10 +581,10 @@ impl MutableBuffer {
     pub unsafe fn try_from_trusted_len_iter<
         E,
         T: ArrowNativeType,
-        I: Iterator<Item = std::result::Result<T, E>>,
+        I: Iterator<Item = Result<T, E>>,
     >(
         iterator: I,
-    ) -> std::result::Result<Self, E> {
+    ) -> Result<Self, E> {
         let item_size = std::mem::size_of::<T>();
         let (_, upper) = iterator.size_hint();
         let upper = upper.expect("try_from_trusted_len_iter requires an upper 
limit");
diff --git a/arrow-buffer/src/buffer/offset.rs 
b/arrow-buffer/src/buffer/offset.rs
new file mode 100644
index 000000000..a80c3c7ec
--- /dev/null
+++ b/arrow-buffer/src/buffer/offset.rs
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::buffer::ScalarBuffer;
+use crate::{ArrowNativeType, MutableBuffer};
+use std::ops::Deref;
+
+/// A non-empty buffer of monotonically increasing, positive integers
+#[derive(Debug, Clone)]
+pub struct OffsetBuffer<O: ArrowNativeType>(ScalarBuffer<O>);
+
+impl<O: ArrowNativeType> OffsetBuffer<O> {
+    /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`]
+    ///
+    /// # Safety
+    ///
+    /// `buffer` must be a non-empty buffer containing monotonically increasing
+    /// values greater than zero
+    pub unsafe fn new_unchecked(buffer: ScalarBuffer<O>) -> Self {
+        Self(buffer)
+    }
+
+    /// Create a new [`OffsetBuffer`] containing a single 0 value
+    pub fn new_empty() -> Self {
+        let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::<O>());
+        Self(buffer.into_buffer().into())
+    }
+}
+
+impl<T: ArrowNativeType> Deref for OffsetBuffer<T> {
+    type Target = [T];
+
+    #[inline]
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T: ArrowNativeType> AsRef<[T]> for OffsetBuffer<T> {
+    #[inline]
+    fn as_ref(&self) -> &[T] {
+        self
+    }
+}
diff --git a/arrow-buffer/src/buffer/scalar.rs 
b/arrow-buffer/src/buffer/scalar.rs
index 124f3f6f5..e688e52fe 100644
--- a/arrow-buffer/src/buffer/scalar.rs
+++ b/arrow-buffer/src/buffer/scalar.rs
@@ -17,6 +17,7 @@
 
 use crate::buffer::Buffer;
 use crate::native::ArrowNativeType;
+use std::marker::PhantomData;
 use std::ops::Deref;
 
 /// Provides a safe API for interpreting a [`Buffer`] as a slice of 
[`ArrowNativeType`]
@@ -25,14 +26,11 @@ use std::ops::Deref;
 ///
 /// All [`ArrowNativeType`] are valid for all possible backing byte 
representations, and as
 /// a result they are "trivially safely transmutable".
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct ScalarBuffer<T: ArrowNativeType> {
-    #[allow(unused)]
+    /// Underlying data buffer
     buffer: Buffer,
-    // Borrows from `buffer` and is valid for the lifetime of `buffer`
-    ptr: *const T,
-    // The length of this slice
-    len: usize,
+    phantom: PhantomData<T>,
 }
 
 impl<T: ArrowNativeType> ScalarBuffer<T> {
@@ -48,39 +46,50 @@ impl<T: ArrowNativeType> ScalarBuffer<T> {
     /// * `bytes` is not large enough for the requested slice
     pub fn new(buffer: Buffer, offset: usize, len: usize) -> Self {
         let size = std::mem::size_of::<T>();
-        let offset_len = offset.checked_add(len).expect("length overflow");
-        let start_bytes = offset.checked_mul(size).expect("start bytes 
overflow");
-        let end_bytes = offset_len.checked_mul(size).expect("end bytes 
overflow");
-
-        let bytes = &buffer.as_slice()[start_bytes..end_bytes];
-
-        // SAFETY: all byte sequences correspond to a valid instance of T
-        let (prefix, offsets, suffix) = unsafe { bytes.align_to::<T>() };
-        assert!(
-            prefix.is_empty() && suffix.is_empty(),
-            "buffer is not aligned to {size} byte boundary"
-        );
-
-        let ptr = offsets.as_ptr();
-        Self { buffer, ptr, len }
+        let byte_offset = offset.checked_mul(size).expect("offset overflow");
+        let byte_len = len.checked_mul(size).expect("length overflow");
+        buffer.slice_with_length(byte_offset, byte_len).into()
     }
 }
 
 impl<T: ArrowNativeType> Deref for ScalarBuffer<T> {
     type Target = [T];
 
+    #[inline]
     fn deref(&self) -> &Self::Target {
-        // SAFETY: Bounds checked in constructor and ptr is valid for the 
lifetime of self
-        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
+        // SAFETY: Verified alignment in From<Buffer>
+        unsafe {
+            std::slice::from_raw_parts(
+                self.buffer.as_ptr() as *const T,
+                self.buffer.len() / std::mem::size_of::<T>(),
+            )
+        }
     }
 }
 
 impl<T: ArrowNativeType> AsRef<[T]> for ScalarBuffer<T> {
+    #[inline]
     fn as_ref(&self) -> &[T] {
         self
     }
 }
 
+impl<T: ArrowNativeType> From<Buffer> for ScalarBuffer<T> {
+    fn from(buffer: Buffer) -> Self {
+        let align = std::mem::align_of::<T>();
+        assert_eq!(
+            buffer.as_ptr().align_offset(align),
+            0,
+            "memory is not aligned"
+        );
+
+        Self {
+            buffer,
+            phantom: Default::default(),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -103,7 +112,7 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "buffer is not aligned to 4 byte boundary")]
+    #[should_panic(expected = "memory is not aligned")]
     fn test_unaligned() {
         let expected = [0_i32, 1, 2];
         let buffer = Buffer::from_iter(expected.iter().cloned());
@@ -112,35 +121,39 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "range end index 16 out of range for slice of 
length 12")]
+    #[should_panic(
+        expected = "the offset of the new Buffer cannot exceed the existing 
length"
+    )]
     fn test_length_out_of_bounds() {
         let buffer = Buffer::from_iter([0_i32, 1, 2]);
         ScalarBuffer::<i32>::new(buffer, 1, 3);
     }
 
     #[test]
-    #[should_panic(expected = "range end index 16 out of range for slice of 
length 12")]
+    #[should_panic(
+        expected = "the offset of the new Buffer cannot exceed the existing 
length"
+    )]
     fn test_offset_out_of_bounds() {
         let buffer = Buffer::from_iter([0_i32, 1, 2]);
         ScalarBuffer::<i32>::new(buffer, 4, 0);
     }
 
     #[test]
-    #[should_panic(expected = "length overflow")]
+    #[should_panic(expected = "offset overflow")]
     fn test_length_overflow() {
         let buffer = Buffer::from_iter([0_i32, 1, 2]);
         ScalarBuffer::<i32>::new(buffer, usize::MAX, 1);
     }
 
     #[test]
-    #[should_panic(expected = "start bytes overflow")]
+    #[should_panic(expected = "offset overflow")]
     fn test_start_overflow() {
         let buffer = Buffer::from_iter([0_i32, 1, 2]);
         ScalarBuffer::<i32>::new(buffer, usize::MAX / 4 + 1, 0);
     }
 
     #[test]
-    #[should_panic(expected = "end bytes overflow")]
+    #[should_panic(expected = "length overflow")]
     fn test_end_overflow() {
         let buffer = Buffer::from_iter([0_i32, 1, 2]);
         ScalarBuffer::<i32>::new(buffer, 0, usize::MAX / 4 + 1);
diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs
index fea04ad0d..3320dfc26 100644
--- a/arrow-buffer/src/bytes.rs
+++ b/arrow-buffer/src/bytes.rs
@@ -61,7 +61,7 @@ impl Bytes {
     /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is 
guaranteed.
     #[inline]
     pub(crate) unsafe fn new(
-        ptr: std::ptr::NonNull<u8>,
+        ptr: NonNull<u8>,
         len: usize,
         deallocation: Deallocation,
     ) -> Bytes {


Reply via email to