This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 9d60852 ARROW-8881: [Rust] Add large binary, string and list support
9d60852 is described below
commit 9d60852e7987f6f3dde4273c0343d326e6cc8dfa
Author: Neville Dipale <[email protected]>
AuthorDate: Sun Jul 5 18:43:06 2020 +0200
ARROW-8881: [Rust] Add large binary, string and list support
Similar to other implementations, this creates binary, string and list
arrays with `i64` offsets instead of `i32`. Behaviourally, everything's the
same as the `i32` counterparts, except for the larger array offsets.
I'll look at the relevant integration tests separately.
Code's a bit repetitive, but given that we currently don't have much review
bandwidth, I'd rather not spend a lot of time on it now, as I want to try to
complete other integration-related tasks in time for 1.0.0.
Closes #7613 from nevi-me/rust-large-lists
Authored-by: Neville Dipale <[email protected]>
Signed-off-by: Neville Dipale <[email protected]>
---
rust/arrow/src/array/array.rs | 756 ++++++++++++++++++---
rust/arrow/src/array/builder.rs | 533 +++++++++++++++
rust/arrow/src/array/equal.rs | 481 +++++++++++++
rust/arrow/src/array/mod.rs | 7 +
rust/arrow/src/datatypes.rs | 124 +++-
rust/arrow/src/ipc/convert.rs | 48 ++
rust/arrow/src/ipc/reader.rs | 6 +-
.../src/bin/arrow-json-integration-test.rs | 43 +-
rust/parquet/src/arrow/schema.rs | 3 +
9 files changed, 1866 insertions(+), 135 deletions(-)
diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs
index 18e11ce..c9eec1e 100644
--- a/rust/arrow/src/array/array.rs
+++ b/rust/arrow/src/array/array.rs
@@ -266,11 +266,14 @@ pub fn make_array(data: ArrayDataRef) -> ArrayRef {
Arc::new(DurationNanosecondArray::from(data)) as ArrayRef
}
DataType::Binary => Arc::new(BinaryArray::from(data)) as ArrayRef,
+ DataType::LargeBinary => Arc::new(LargeBinaryArray::from(data)) as
ArrayRef,
DataType::FixedSizeBinary(_) => {
Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef
}
DataType::Utf8 => Arc::new(StringArray::from(data)) as ArrayRef,
+ DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as
ArrayRef,
DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef,
+ DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as
ArrayRef,
DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef,
DataType::Union(_) => Arc::new(UnionArray::from(data)) as ArrayRef,
DataType::FixedSizeList(_, _) => {
@@ -951,6 +954,30 @@ impl ListArrayOps for FixedSizeBinaryArray {
}
}
+/// Common operations for large List types, currently `LargeListArray`,
`LargeBinaryArray`
+/// and `LargeStringArray`
+pub trait LargeListArrayOps {
+ fn value_offset_at(&self, i: usize) -> i64;
+}
+
+impl LargeListArrayOps for LargeBinaryArray {
+ fn value_offset_at(&self, i: usize) -> i64 {
+ self.value_offset_at(i)
+ }
+}
+
+impl LargeListArrayOps for LargeStringArray {
+ fn value_offset_at(&self, i: usize) -> i64 {
+ self.value_offset_at(i)
+ }
+}
+
+impl LargeListArrayOps for LargeListArray {
+ fn value_offset_at(&self, i: usize) -> i64 {
+ self.value_offset_at(i)
+ }
+}
+
/// A list array where each element is a variable-sized sequence of values
with the same
/// type.
pub struct ListArray {
@@ -959,6 +986,14 @@ pub struct ListArray {
value_offsets: RawPtrBox<i32>,
}
+/// A list array where each element is a variable-sized sequence of values with the same
+/// type. Unlike `ListArray`, its value offsets are stored as `i64` rather than `i32`.
+pub struct LargeListArray {
+ data: ArrayDataRef,
+ values: ArrayRef,
+ value_offsets: RawPtrBox<i64>,
+}
+
impl ListArray {
/// Returns a reference to the values of this list.
pub fn values(&self) -> ArrayRef {
@@ -999,6 +1034,46 @@ impl ListArray {
}
}
+impl LargeListArray {
+ /// Returns a reference to the values of this list.
+ pub fn values(&self) -> ArrayRef {
+ self.values.clone()
+ }
+
+ /// Returns a clone of the value type of this list.
+ pub fn value_type(&self) -> DataType {
+ self.values.data().data_type().clone()
+ }
+
+ /// Returns ith value of this list array.
+ pub fn value(&self, i: usize) -> ArrayRef {
+ self.values
+ .slice(self.value_offset(i) as usize, self.value_length(i) as
usize)
+ }
+
+ /// Returns the offset for value at index `i`.
+ ///
+ /// Note this doesn't do any bounds checking, for performance reasons.
+ #[inline]
+ pub fn value_offset(&self, i: usize) -> i64 {
+ self.value_offset_at(self.data.offset() + i)
+ }
+
+ /// Returns the length for value at index `i`.
+ ///
+ /// Note this doesn't do any bounds checking, for performance reasons.
+ #[inline]
+ pub fn value_length(&self, mut i: usize) -> i64 {
+ i += self.data.offset();
+ self.value_offset_at(i + 1) - self.value_offset_at(i)
+ }
+
+ #[inline]
+ fn value_offset_at(&self, i: usize) -> i64 {
+ unsafe { *self.value_offsets.get().add(i) }
+ }
+}
+
/// Constructs a `ListArray` from an array data reference.
impl From<ArrayDataRef> for ListArray {
fn from(data: ArrayDataRef) -> Self {
@@ -1030,6 +1105,37 @@ impl From<ArrayDataRef> for ListArray {
}
}
+/// Constructs a `LargeListArray` from an array data reference.
+impl From<ArrayDataRef> for LargeListArray {
+ fn from(data: ArrayDataRef) -> Self {
+ assert_eq!(
+ data.buffers().len(),
+ 1,
+ "LargeListArray data should contain a single buffer only (value
offsets)"
+ );
+ assert_eq!(
+ data.child_data().len(),
+ 1,
+ "LargeListArray should contain a single child array (values array)"
+ );
+ let values = make_array(data.child_data()[0].clone());
+ let raw_value_offsets = data.buffers()[0].raw_data();
+ assert!(
+ memory::is_aligned(raw_value_offsets, mem::align_of::<i64>()),
+ "memory is not aligned"
+ );
+ let value_offsets = raw_value_offsets as *const i64;
+ unsafe {
+ assert_eq!(*value_offsets.offset(0), 0, "offsets do not start at
zero");
+ }
+ Self {
+ data,
+ values,
+ value_offsets: RawPtrBox::new(value_offsets),
+ }
+ }
+}
+
impl Array for ListArray {
fn as_any(&self) -> &Any {
self
@@ -1044,6 +1150,20 @@ impl Array for ListArray {
}
}
+impl Array for LargeListArray {
+ fn as_any(&self) -> &Any {
+ self
+ }
+
+ fn data(&self) -> ArrayDataRef {
+ self.data.clone()
+ }
+
+ fn data_ref(&self) -> &ArrayDataRef {
+ &self.data
+ }
+}
+
// Helper function for printing potentially long arrays.
fn print_long_array<A, F>(array: &A, f: &mut fmt::Formatter, print_item: F) ->
fmt::Result
where
@@ -1086,6 +1206,16 @@ impl fmt::Debug for ListArray {
}
}
+impl fmt::Debug for LargeListArray {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "LargeListArray\n[\n")?;
+ print_long_array(self, f, |array, index, f| {
+ fmt::Debug::fmt(&array.value(index), f)
+ })?;
+ write!(f, "]")
+ }
+}
+
/// A list array where each element is a fixed-size sequence of values with
the same
/// type.
pub struct FixedSizeListArray {
@@ -1194,20 +1324,69 @@ impl fmt::Debug for FixedSizeListArray {
}
}
-/// A type of `ListArray` whose elements are binaries.
-pub struct BinaryArray {
- data: ArrayDataRef,
- value_offsets: RawPtrBox<i32>,
- value_data: RawPtrBox<u8>,
-}
+macro_rules! make_binary_type {
+ ($name:ident, $offset_ty:ty) => {
+ pub struct $name {
+ data: ArrayDataRef,
+ value_offsets: RawPtrBox<$offset_ty>,
+ value_data: RawPtrBox<u8>,
+ }
-/// A type of `ListArray` whose elements are UTF8 strings.
-pub struct StringArray {
- data: ArrayDataRef,
- value_offsets: RawPtrBox<i32>,
- value_data: RawPtrBox<u8>,
+ impl $name {
+ /// Returns the offset for the element at index `i`.
+ ///
+ /// Note this doesn't do any bounds checking, for performance reasons.
+ #[inline]
+ pub fn value_offset(&self, i: usize) -> $offset_ty {
+ self.value_offset_at(self.data.offset() + i)
+ }
+
+ /// Returns the length for the element at index `i`.
+ ///
+ /// Note this doesn't do any bounds checking, for performance reasons.
+ #[inline]
+ pub fn value_length(&self, mut i: usize) -> $offset_ty {
+ i += self.data.offset();
+ self.value_offset_at(i + 1) - self.value_offset_at(i)
+ }
+
+ /// Returns a clone of the value offset buffer
+ pub fn value_offsets(&self) -> Buffer {
+ self.data.buffers()[0].clone()
+ }
+
+ /// Returns a clone of the value data buffer
+ pub fn value_data(&self) -> Buffer {
+ self.data.buffers()[1].clone()
+ }
+
+ #[inline]
+ fn value_offset_at(&self, i: usize) -> $offset_ty {
+ unsafe { *self.value_offsets.get().add(i) }
+ }
+ }
+
+ impl Array for $name {
+ fn as_any(&self) -> &Any {
+ self
+ }
+
+ fn data(&self) -> ArrayDataRef {
+ self.data.clone()
+ }
+
+ fn data_ref(&self) -> &ArrayDataRef {
+ &self.data
+ }
+ }
+ };
}
+make_binary_type!(BinaryArray, i32);
+make_binary_type!(LargeBinaryArray, i64);
+make_binary_type!(StringArray, i32);
+make_binary_type!(LargeStringArray, i64);
+
/// A type of `FixedSizeListArray` whose elements are binaries.
pub struct FixedSizeBinaryArray {
data: ArrayDataRef,
@@ -1229,41 +1408,29 @@ impl BinaryArray {
}
}
- /// Returns the offset for the element at index `i`.
- ///
- /// Note this doesn't do any bound checking, for performance reason.
- #[inline]
- pub fn value_offset(&self, i: usize) -> i32 {
- self.value_offset_at(self.data.offset() + i)
- }
-
- /// Returns the length for the element at index `i`.
- ///
- /// Note this doesn't do any bound checking, for performance reason.
- #[inline]
- pub fn value_length(&self, mut i: usize) -> i32 {
- i += self.data.offset();
- self.value_offset_at(i + 1) - self.value_offset_at(i)
- }
-
- /// Returns a clone of the value offset buffer
- pub fn value_offsets(&self) -> Buffer {
- self.data.buffers()[0].clone()
- }
-
- /// Returns a clone of the value data buffer
- pub fn value_data(&self) -> Buffer {
- self.data.buffers()[1].clone()
+ /// Returns a new binary array builder
+ pub fn builder(capacity: usize) -> BinaryBuilder {
+ BinaryBuilder::new(capacity)
}
+}
- #[inline]
- fn value_offset_at(&self, i: usize) -> i32 {
- unsafe { *self.value_offsets.get().add(i) }
+impl LargeBinaryArray {
+ /// Returns the element at index `i` as a byte slice.
+ pub fn value(&self, i: usize) -> &[u8] {
+ assert!(i < self.data.len(), "LargeBinaryArray out of bounds access");
+ let offset = i.checked_add(self.data.offset()).unwrap();
+ unsafe {
+ let pos = self.value_offset_at(offset);
+ std::slice::from_raw_parts(
+ self.value_data.get().offset(pos as isize),
+ (self.value_offset_at(offset + 1) - pos) as usize,
+ )
+ }
}
- // Returns a new binary array builder
- pub fn builder(capacity: usize) -> BinaryBuilder {
- BinaryBuilder::new(capacity)
+ /// Returns a new large binary array builder
+ pub fn builder(capacity: usize) -> LargeBinaryBuilder {
+ LargeBinaryBuilder::new(capacity)
}
}
@@ -1283,41 +1450,31 @@ impl StringArray {
}
}
- /// Returns the offset for the element at index `i`.
- ///
- /// Note this doesn't do any bound checking, for performance reason.
- #[inline]
- pub fn value_offset(&self, i: usize) -> i32 {
- self.value_offset_at(self.data.offset() + i)
- }
-
- /// Returns the length for the element at index `i`.
- ///
- /// Note this doesn't do any bound checking, for performance reason.
- #[inline]
- pub fn value_length(&self, mut i: usize) -> i32 {
- i += self.data.offset();
- self.value_offset_at(i + 1) - self.value_offset_at(i)
- }
-
- /// Returns a clone of the value offset buffer
- pub fn value_offsets(&self) -> Buffer {
- self.data.buffers()[0].clone()
+ /// Returns a new string array builder
+ pub fn builder(capacity: usize) -> StringBuilder {
+ StringBuilder::new(capacity)
}
+}
- /// Returns a clone of the value data buffer
- pub fn value_data(&self) -> Buffer {
- self.data.buffers()[1].clone()
- }
+impl LargeStringArray {
+ /// Returns the element at index `i` as a string slice.
+ pub fn value(&self, i: usize) -> &str {
+ assert!(i < self.data.len(), "LargeStringArray out of bounds access");
+ let offset = i.checked_add(self.data.offset()).unwrap();
+ unsafe {
+ let pos = self.value_offset_at(offset);
+ let slice = std::slice::from_raw_parts(
+ self.value_data.get().offset(pos as isize),
+ (self.value_offset_at(offset + 1) - pos) as usize,
+ );
- #[inline]
- fn value_offset_at(&self, i: usize) -> i32 {
- unsafe { *self.value_offsets.get().add(i) }
+ std::str::from_utf8_unchecked(slice)
+ }
}
- // Returns a new string array builder
- pub fn builder(capacity: usize) -> StringBuilder {
- StringBuilder::new(capacity)
+ /// Returns a new large string array builder
+ pub fn builder(capacity: usize) -> LargeStringBuilder {
+ LargeStringBuilder::new(capacity)
}
}
@@ -1386,6 +1543,27 @@ impl From<ArrayDataRef> for BinaryArray {
}
}
+impl From<ArrayDataRef> for LargeBinaryArray {
+ fn from(data: ArrayDataRef) -> Self {
+ assert_eq!(
+ data.buffers().len(),
+ 2,
+ "LargeBinaryArray data should contain 2 buffers only (offsets and
values)"
+ );
+ let raw_value_offsets = data.buffers()[0].raw_data();
+ assert!(
+ memory::is_aligned(raw_value_offsets, mem::align_of::<i64>()),
+ "memory is not aligned"
+ );
+ let value_data = data.buffers()[1].raw_data();
+ Self {
+ data,
+ value_offsets: RawPtrBox::new(raw_value_offsets as *const i64),
+ value_data: RawPtrBox::new(value_data),
+ }
+ }
+}
+
impl From<ArrayDataRef> for StringArray {
fn from(data: ArrayDataRef) -> Self {
assert_eq!(
@@ -1407,6 +1585,27 @@ impl From<ArrayDataRef> for StringArray {
}
}
+impl From<ArrayDataRef> for LargeStringArray {
+ fn from(data: ArrayDataRef) -> Self {
+ assert_eq!(
+ data.buffers().len(),
+ 2,
+ "LargeStringArray data should contain 2 buffers only (offsets and
values)"
+ );
+ let raw_value_offsets = data.buffers()[0].raw_data();
+ assert!(
+ memory::is_aligned(raw_value_offsets, mem::align_of::<i64>()),
+ "memory is not aligned"
+ );
+ let value_data = data.buffers()[1].raw_data();
+ Self {
+ data,
+ value_offsets: RawPtrBox::new(raw_value_offsets as *const i64),
+ value_data: RawPtrBox::new(value_data),
+ }
+ }
+}
+
impl From<ArrayDataRef> for FixedSizeBinaryArray {
fn from(data: ArrayDataRef) -> Self {
assert_eq!(
@@ -1447,6 +1646,26 @@ impl<'a> From<Vec<&'a str>> for StringArray {
}
}
+impl<'a> From<Vec<&'a str>> for LargeStringArray {
+ fn from(v: Vec<&'a str>) -> Self {
+ let mut offsets = Vec::with_capacity(v.len() + 1);
+ let mut values = Vec::new();
+ let mut length_so_far = 0;
+ offsets.push(length_so_far);
+ for s in &v {
+ length_so_far += s.len() as i64;
+ offsets.push(length_so_far as i64);
+ values.extend_from_slice(s.as_bytes());
+ }
+ let array_data = ArrayData::builder(DataType::LargeUtf8)
+ .len(v.len())
+ .add_buffer(Buffer::from(offsets.to_byte_slice()))
+ .add_buffer(Buffer::from(&values[..]))
+ .build();
+ LargeStringArray::from(array_data)
+ }
+}
+
impl From<Vec<&[u8]>> for BinaryArray {
fn from(v: Vec<&[u8]>) -> Self {
let mut offsets = Vec::with_capacity(v.len() + 1);
@@ -1467,6 +1686,26 @@ impl From<Vec<&[u8]>> for BinaryArray {
}
}
+impl From<Vec<&[u8]>> for LargeBinaryArray {
+ fn from(v: Vec<&[u8]>) -> Self {
+ let mut offsets = Vec::with_capacity(v.len() + 1);
+ let mut values = Vec::new();
+ let mut length_so_far = 0;
+ offsets.push(length_so_far);
+ for s in &v {
+ length_so_far += s.len() as i64;
+ offsets.push(length_so_far as i64);
+ values.extend_from_slice(s);
+ }
+ let array_data = ArrayData::builder(DataType::LargeBinary)
+ .len(v.len())
+ .add_buffer(Buffer::from(offsets.to_byte_slice()))
+ .add_buffer(Buffer::from(&values[..]))
+ .build();
+ LargeBinaryArray::from(array_data)
+ }
+}
+
impl<'a> TryFrom<Vec<Option<&'a str>>> for StringArray {
type Error = ArrowError;
@@ -1483,6 +1722,22 @@ impl<'a> TryFrom<Vec<Option<&'a str>>> for StringArray {
}
}
+impl<'a> TryFrom<Vec<Option<&'a str>>> for LargeStringArray {
+ type Error = ArrowError;
+
+ fn try_from(v: Vec<Option<&'a str>>) -> Result<Self> {
+ let mut builder = LargeStringBuilder::new(v.len());
+ for val in v {
+ if let Some(s) = val {
+ builder.append_value(s)?;
+ } else {
+ builder.append(false)?;
+ }
+ }
+ Ok(builder.finish())
+ }
+}
+
/// Creates a `BinaryArray` from `List<u8>` array
impl From<ListArray> for BinaryArray {
fn from(v: ListArray) -> Self {
@@ -1543,6 +1798,66 @@ impl From<ListArray> for StringArray {
}
}
+/// Creates a `LargeBinaryArray` from `LargeList<u8>` array
+impl From<LargeListArray> for LargeBinaryArray {
+ fn from(v: LargeListArray) -> Self {
+ assert_eq!(
+ v.data().child_data()[0].child_data().len(),
+ 0,
+ "LargeBinaryArray can only be created from list array of u8 values
\
+ (i.e. LargeList<PrimitiveArray<u8>>)."
+ );
+ assert_eq!(
+ v.data().child_data()[0].data_type(),
+ &DataType::UInt8,
+ "LargeBinaryArray can only be created from LargeList<u8> arrays,
mismatched data types."
+ );
+
+ let mut builder = ArrayData::builder(DataType::LargeBinary)
+ .len(v.len())
+ .add_buffer(v.data().buffers()[0].clone())
+ .add_buffer(v.data().child_data()[0].buffers()[0].clone());
+ if let Some(bitmap) = v.data().null_bitmap() {
+ builder = builder
+ .null_count(v.data().null_count())
+ .null_bit_buffer(bitmap.bits.clone())
+ }
+
+ let data = builder.build();
+ Self::from(data)
+ }
+}
+
+/// Creates a `LargeStringArray` from `LargeList<u8>` array
+impl From<LargeListArray> for LargeStringArray {
+ fn from(v: LargeListArray) -> Self {
+ assert_eq!(
+ v.data().child_data()[0].child_data().len(),
+ 0,
+ "LargeStringArray can only be created from list array of u8 values
\
+ (i.e. LargeList<PrimitiveArray<u8>>)."
+ );
+ assert_eq!(
+ v.data().child_data()[0].data_type(),
+ &DataType::UInt8,
+ "LargeStringArray can only be created from LargeList<u8> arrays,
mismatched data types."
+ );
+
+ let mut builder = ArrayData::builder(DataType::LargeUtf8)
+ .len(v.len())
+ .add_buffer(v.data().buffers()[0].clone())
+ .add_buffer(v.data().child_data()[0].buffers()[0].clone());
+ if let Some(bitmap) = v.data().null_bitmap() {
+ builder = builder
+ .null_count(v.data().null_count())
+ .null_bit_buffer(bitmap.bits.clone())
+ }
+
+ let data = builder.build();
+ Self::from(data)
+ }
+}
+
/// Creates a `FixedSizeBinaryArray` from `FixedSizeList<u8>` array
impl From<FixedSizeListArray> for FixedSizeBinaryArray {
fn from(v: FixedSizeListArray) -> Self {
@@ -1572,9 +1887,19 @@ impl From<FixedSizeListArray> for FixedSizeBinaryArray {
}
}
-impl fmt::Debug for BinaryArray {
+impl fmt::Debug for BinaryArray {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "BinaryArray\n[\n")?;
+ print_long_array(self, f, |array, index, f| {
+ fmt::Debug::fmt(&array.value(index), f)
+ })?;
+ write!(f, "]")
+ }
+}
+
+impl fmt::Debug for LargeBinaryArray {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "BinaryArray\n[\n")?;
+ write!(f, "LargeBinaryArray\n[\n")?;
print_long_array(self, f, |array, index, f| {
fmt::Debug::fmt(&array.value(index), f)
})?;
@@ -1592,9 +1917,9 @@ impl fmt::Debug for StringArray {
}
}
-impl fmt::Debug for FixedSizeBinaryArray {
+impl fmt::Debug for LargeStringArray {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "FixedSizeBinaryArray<{}>\n[\n", self.value_length())?;
+ write!(f, "LargeStringArray\n[\n")?;
print_long_array(self, f, |array, index, f| {
fmt::Debug::fmt(&array.value(index), f)
})?;
@@ -1602,31 +1927,13 @@ impl fmt::Debug for FixedSizeBinaryArray {
}
}
-impl Array for BinaryArray {
- fn as_any(&self) -> &Any {
- self
- }
-
- fn data(&self) -> ArrayDataRef {
- self.data.clone()
- }
-
- fn data_ref(&self) -> &ArrayDataRef {
- &self.data
- }
-}
-
-impl Array for StringArray {
- fn as_any(&self) -> &Any {
- self
- }
-
- fn data(&self) -> ArrayDataRef {
- self.data.clone()
- }
-
- fn data_ref(&self) -> &ArrayDataRef {
- &self.data
+impl fmt::Debug for FixedSizeBinaryArray {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "FixedSizeBinaryArray<{}>\n[\n", self.value_length())?;
+ print_long_array(self, f, |array, index, f| {
+ fmt::Debug::fmt(&array.value(index), f)
+ })?;
+ write!(f, "]")
}
}
@@ -2537,6 +2844,75 @@ mod tests {
}
#[test]
+ fn test_large_list_array() {
+ // Construct a value array
+ let value_data = ArrayData::builder(DataType::Int32)
+ .len(8)
+ .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6,
7].to_byte_slice()))
+ .build();
+
+ // Construct a buffer for value offsets, for the nested array:
+ // [[0, 1, 2], [3, 4, 5], [6, 7]]
+ let value_offsets = Buffer::from(&[0i64, 3, 6, 8].to_byte_slice());
+
+ // Construct a list array from the above two
+ let list_data_type = DataType::LargeList(Box::new(DataType::Int32));
+ let list_data = ArrayData::builder(list_data_type.clone())
+ .len(3)
+ .add_buffer(value_offsets.clone())
+ .add_child_data(value_data.clone())
+ .build();
+ let list_array = LargeListArray::from(list_data);
+
+ let values = list_array.values();
+ assert_eq!(value_data, values.data());
+ assert_eq!(DataType::Int32, list_array.value_type());
+ assert_eq!(3, list_array.len());
+ assert_eq!(0, list_array.null_count());
+ assert_eq!(6, list_array.value_offset(2));
+ assert_eq!(2, list_array.value_length(2));
+ assert_eq!(
+ 0,
+ list_array
+ .value(0)
+ .as_any()
+ .downcast_ref::<Int32Array>()
+ .unwrap()
+ .value(0)
+ );
+ for i in 0..3 {
+ assert!(list_array.is_valid(i));
+ assert!(!list_array.is_null(i));
+ }
+
+ // Now test with a non-zero offset
+ let list_data = ArrayData::builder(list_data_type)
+ .len(3)
+ .offset(1)
+ .add_buffer(value_offsets)
+ .add_child_data(value_data.clone())
+ .build();
+ let list_array = LargeListArray::from(list_data);
+
+ let values = list_array.values();
+ assert_eq!(value_data, values.data());
+ assert_eq!(DataType::Int32, list_array.value_type());
+ assert_eq!(3, list_array.len());
+ assert_eq!(0, list_array.null_count());
+ assert_eq!(6, list_array.value_offset(1));
+ assert_eq!(2, list_array.value_length(1));
+ assert_eq!(
+ 3,
+ list_array
+ .value(0)
+ .as_any()
+ .downcast_ref::<Int32Array>()
+ .unwrap()
+ .value(0)
+ );
+ }
+
+ #[test]
fn test_dictionary_array() {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int8)
@@ -2747,6 +3123,72 @@ mod tests {
}
#[test]
+ fn test_large_list_array_slice() {
+ // Construct a value array
+ let value_data = ArrayData::builder(DataType::Int32)
+ .len(10)
+ .add_buffer(Buffer::from(
+ &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9].to_byte_slice(),
+ ))
+ .build();
+
+ // Construct a buffer for value offsets, for the nested array:
+ // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]]
+ let value_offsets =
+ Buffer::from(&[0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10].to_byte_slice());
+ // 01011001 00000001
+ let mut null_bits: [u8; 2] = [0; 2];
+ bit_util::set_bit(&mut null_bits, 0);
+ bit_util::set_bit(&mut null_bits, 3);
+ bit_util::set_bit(&mut null_bits, 4);
+ bit_util::set_bit(&mut null_bits, 6);
+ bit_util::set_bit(&mut null_bits, 8);
+
+ // Construct a list array from the above two
+ let list_data_type = DataType::LargeList(Box::new(DataType::Int32));
+ let list_data = ArrayData::builder(list_data_type.clone())
+ .len(9)
+ .add_buffer(value_offsets.clone())
+ .add_child_data(value_data.clone())
+ .null_bit_buffer(Buffer::from(null_bits))
+ .build();
+ let list_array = LargeListArray::from(list_data);
+
+ let values = list_array.values();
+ assert_eq!(value_data, values.data());
+ assert_eq!(DataType::Int32, list_array.value_type());
+ assert_eq!(9, list_array.len());
+ assert_eq!(4, list_array.null_count());
+ assert_eq!(2, list_array.value_offset(3));
+ assert_eq!(2, list_array.value_length(3));
+
+ let sliced_array = list_array.slice(1, 6);
+ assert_eq!(6, sliced_array.len());
+ assert_eq!(1, sliced_array.offset());
+ assert_eq!(3, sliced_array.null_count());
+
+ for i in 0..sliced_array.len() {
+ if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+ assert!(sliced_array.is_valid(i));
+ } else {
+ assert!(sliced_array.is_null(i));
+ }
+ }
+
+ // Check offset and length for each non-null value.
+ let sliced_list_array = sliced_array
+ .as_any()
+ .downcast_ref::<LargeListArray>()
+ .unwrap();
+ assert_eq!(2, sliced_list_array.value_offset(2));
+ assert_eq!(2, sliced_list_array.value_length(2));
+ assert_eq!(4, sliced_list_array.value_offset(3));
+ assert_eq!(2, sliced_list_array.value_length(3));
+ assert_eq!(6, sliced_list_array.value_offset(5));
+ assert_eq!(3, sliced_list_array.value_length(5));
+ }
+
+ #[test]
fn test_fixed_size_list_array_slice() {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int32)
@@ -2902,6 +3344,53 @@ mod tests {
}
#[test]
+ fn test_large_binary_array() {
+ let values: [u8; 12] = [
+ b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e',
b't',
+ ];
+ let offsets: [i64; 4] = [0, 5, 5, 12];
+
+ // Array data: ["hello", "", "parquet"]
+ let array_data = ArrayData::builder(DataType::Binary)
+ .len(3)
+ .add_buffer(Buffer::from(offsets.to_byte_slice()))
+ .add_buffer(Buffer::from(&values[..]))
+ .build();
+ let binary_array = LargeBinaryArray::from(array_data);
+ assert_eq!(3, binary_array.len());
+ assert_eq!(0, binary_array.null_count());
+ assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0));
+ assert_eq!([] as [u8; 0], binary_array.value(1));
+ assert_eq!(
+ [b'p', b'a', b'r', b'q', b'u', b'e', b't'],
+ binary_array.value(2)
+ );
+ assert_eq!(5, binary_array.value_offset(2));
+ assert_eq!(7, binary_array.value_length(2));
+ for i in 0..3 {
+ assert!(binary_array.is_valid(i));
+ assert!(!binary_array.is_null(i));
+ }
+
+ // Test binary array with offset
+ let array_data = ArrayData::builder(DataType::LargeBinary)
+ .len(4)
+ .offset(1)
+ .add_buffer(Buffer::from(offsets.to_byte_slice()))
+ .add_buffer(Buffer::from(&values[..]))
+ .build();
+ let binary_array = LargeBinaryArray::from(array_data);
+ assert_eq!(
+ [b'p', b'a', b'r', b'q', b'u', b'e', b't'],
+ binary_array.value(1)
+ );
+ assert_eq!(5, binary_array.value_offset(0));
+ assert_eq!(0, binary_array.value_length(0));
+ assert_eq!(5, binary_array.value_offset(1));
+ assert_eq!(7, binary_array.value_length(1));
+ }
+
+ #[test]
fn test_binary_array_from_list_array() {
let values: [u8; 12] = [
b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e',
b't',
@@ -2941,6 +3430,45 @@ mod tests {
}
#[test]
+ fn test_large_binary_array_from_list_array() {
+ let values: [u8; 12] = [
+ b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e',
b't',
+ ];
+ let values_data = ArrayData::builder(DataType::UInt8)
+ .len(12)
+ .add_buffer(Buffer::from(&values[..]))
+ .build();
+ let offsets: [i64; 4] = [0, 5, 5, 12];
+
+ // Array data: ["hello", "", "parquet"]
+ let array_data1 = ArrayData::builder(DataType::LargeBinary)
+ .len(3)
+ .add_buffer(Buffer::from(offsets.to_byte_slice()))
+ .add_buffer(Buffer::from(&values[..]))
+ .build();
+ let binary_array1 = LargeBinaryArray::from(array_data1);
+
+ let array_data2 = ArrayData::builder(DataType::Binary)
+ .len(3)
+ .add_buffer(Buffer::from(offsets.to_byte_slice()))
+ .add_child_data(values_data)
+ .build();
+ let list_array = LargeListArray::from(array_data2);
+ let binary_array2 = LargeBinaryArray::from(list_array);
+
+ assert_eq!(2, binary_array2.data().buffers().len());
+ assert_eq!(0, binary_array2.data().child_data().len());
+
+ assert_eq!(binary_array1.len(), binary_array2.len());
+ assert_eq!(binary_array1.null_count(), binary_array2.null_count());
+ for i in 0..binary_array1.len() {
+ assert_eq!(binary_array1.value(i), binary_array2.value(i));
+ assert_eq!(binary_array1.value_offset(i),
binary_array2.value_offset(i));
+ assert_eq!(binary_array1.value_length(i),
binary_array2.value_length(i));
+ }
+ }
+
+ #[test]
fn test_string_array_from_u8_slice() {
let values: Vec<&str> = vec!["hello", "", "parquet"];
@@ -2961,6 +3489,26 @@ mod tests {
}
#[test]
+ fn test_large_string_array_from_u8_slice() {
+ let values: Vec<&str> = vec!["hello", "", "parquet"];
+
+ // Array data: ["hello", "", "parquet"]
+ let string_array = LargeStringArray::from(values);
+
+ assert_eq!(3, string_array.len());
+ assert_eq!(0, string_array.null_count());
+ assert_eq!("hello", string_array.value(0));
+ assert_eq!("", string_array.value(1));
+ assert_eq!("parquet", string_array.value(2));
+ assert_eq!(5, string_array.value_offset(2));
+ assert_eq!(7, string_array.value_length(2));
+ for i in 0..3 {
+ assert!(string_array.is_valid(i));
+ assert!(!string_array.is_null(i));
+ }
+ }
+
+ #[test]
fn test_nested_string_array() {
let string_builder = StringBuilder::new(3);
let mut list_of_string_builder = ListBuilder::new(string_builder);
@@ -3164,10 +3712,10 @@ mod tests {
}
#[test]
- fn test_fixed_size_binary_array_fmt_debug() {
- let arr: StringArray = vec!["hello", "arrow"].into();
+ fn test_large_string_array_fmt_debug() {
+ let arr: LargeStringArray = vec!["hello", "arrow"].into();
assert_eq!(
- "StringArray\n[\n \"hello\",\n \"arrow\",\n]",
+ "LargeStringArray\n[\n \"hello\",\n \"arrow\",\n]",
format!("{:?}", arr)
);
}
diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs
index 6e8f196..691f7cd 100644
--- a/rust/arrow/src/array/builder.rs
+++ b/rust/arrow/src/array/builder.rs
@@ -824,6 +824,207 @@ where
}
/// Array builder for `ListArray`
+pub struct LargeListBuilder<T: ArrayBuilder> {
+ offsets_builder: Int64BufferBuilder,
+ bitmap_builder: BooleanBufferBuilder,
+ values_builder: T,
+ len: usize,
+}
+
+impl<T: ArrayBuilder> LargeListBuilder<T> {
+    /// Creates a new `LargeListBuilder` from a given values array builder.
+    ///
+    /// The current length of `values_builder` is used as the initial item capacity.
+    pub fn new(values_builder: T) -> Self {
+        let capacity = values_builder.len();
+        Self::with_capacity(values_builder, capacity)
+    }
+
+    /// Creates a new `LargeListBuilder` from a given values array builder.
+    ///
+    /// `capacity` is the number of items to pre-allocate space for in this builder.
+    pub fn with_capacity(values_builder: T, capacity: usize) -> Self {
+        // One more offset than items is allocated; the leading offset is always 0.
+        let mut offsets_builder = Int64BufferBuilder::new(capacity + 1);
+        offsets_builder.append(0).unwrap();
+        Self {
+            offsets_builder,
+            bitmap_builder: BooleanBufferBuilder::new(capacity),
+            values_builder,
+            len: 0,
+        }
+    }
+}
+
+impl<T: ArrayBuilder> ArrayBuilder for LargeListBuilder<T>
+where
+    T: 'static,
+{
+    /// Returns the builder as a non-mutable `Any` reference.
+    fn as_any(&self) -> &Any {
+        self
+    }
+
+    /// Appends data from other arrays into the builder
+    ///
+    /// This is most useful when concatenating arrays of the same type into a builder.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ArrowError::InvalidArgumentError` if any array's data type differs from
+    /// this builder's data type, or if an array does not have exactly 1 buffer and
+    /// exactly 1 child_data element.
+    fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> {
+        // validate arraydata and count the total number of list slots to append
+        let mut total_len = 0;
+        for array in data {
+            if array.data_type() != &self.data_type() {
+                return Err(ArrowError::InvalidArgumentError(
+                    "Cannot append data to builder if data types are different"
+                        .to_string(),
+                ));
+            }
+            if array.buffers().len() != 1 {
+                return Err(ArrowError::InvalidArgumentError(
+                    "List arrays should have 1 buffer".to_string(),
+                ));
+            }
+            if array.child_data().len() != 1 {
+                return Err(ArrowError::InvalidArgumentError(
+                    "List arrays should have 1 child_data element".to_string(),
+                ));
+            }
+            total_len += array.len();
+        }
+        // reserve memory
+        self.offsets_builder.reserve(total_len)?;
+        self.bitmap_builder.reserve(total_len)?;
+        // values_builder is allocated by the relevant builder, and is not allocated here
+
+        // determine the latest offset on the builder
+        let mut cum_offset = if self.offsets_builder.len() == 0 {
+            0
+        } else {
+            // peek into buffer to get last appended offset; each offset is one `i64`
+            let offset_width = std::mem::size_of::<i64>();
+            let buffer = self.offsets_builder.buffer.data();
+            let len = self.offsets_builder.len();
+            let (start, end) = ((len - 1) * offset_width, len * offset_width);
+            let slice = &buffer[start..end];
+            i64::from_le_bytes(slice.try_into().unwrap())
+        };
+        for array in data {
+            let len = array.len();
+            if len == 0 {
+                continue;
+            }
+            let offset = array.offset();
+
+            // SAFETY: the data type was validated above to be this builder's
+            // `LargeList` type, and `LargeListArray` has i64 offsets.
+            let offsets = unsafe {
+                &array.buffers()[0].typed_data::<i64>()[offset..(len + offset) + 1]
+            };
+            // the offsets of the child array determine its length
+            // this could be obtained by getting the concrete ListArray and getting
+            // value_offsets
+            let offset_at_len = offsets[offsets.len() - 1] as usize;
+            let first_offset = offsets[0] as usize;
+            // create the child array and offset it
+            let child_data = &array.child_data()[0];
+            let child_array = make_array(child_data.clone());
+            // slice the child array to account for offsets
+            let sliced = child_array.slice(first_offset, offset_at_len - first_offset);
+            self.values().append_data(&[sliced.data()])?;
+            // rebase every slot length (`w[1] - w[0]`) onto the builder's running offset
+            let adjusted_offsets: Vec<i64> = offsets
+                .windows(2)
+                .map(|w| {
+                    let curr_offset = w[1] - w[0] + cum_offset;
+                    cum_offset = curr_offset;
+                    curr_offset
+                })
+                .collect();
+            self.offsets_builder
+                .append_slice(adjusted_offsets.as_slice())?;
+
+            for i in 0..len {
+                // account for offset as `ArrayData` does not
+                self.bitmap_builder.append(array.is_valid(offset + i))?;
+            }
+        }
+
+        // append array length
+        self.len += total_len;
+        Ok(())
+    }
+
+    /// Returns the data type of the builder
+    ///
+    /// This is used for validating array data types in `append_data`
+    fn data_type(&self) -> DataType {
+        DataType::LargeList(Box::new(self.values_builder.data_type()))
+    }
+
+    /// Returns the builder as a mutable `Any` reference.
+    fn as_any_mut(&mut self) -> &mut Any {
+        self
+    }
+
+    /// Returns the boxed builder as a box of `Any`.
+    fn into_box_any(self: Box<Self>) -> Box<Any> {
+        self
+    }
+
+    /// Returns the number of array slots in the builder
+    fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Builds the array and reset this builder.
+    ///
+    /// Wraps the `LargeListArray` produced by the inherent `finish` method in an `Arc`.
+    fn finish(&mut self) -> ArrayRef {
+        Arc::new(self.finish())
+    }
+}
+
+impl<T: ArrayBuilder> LargeListBuilder<T>
+where
+    T: 'static,
+{
+    /// Returns the child array builder as a mutable reference.
+    ///
+    /// This mutable reference can be used to append values into the child array builder,
+    /// but you must call `append` to delimit each distinct list value.
+    pub fn values(&mut self) -> &mut T {
+        &mut self.values_builder
+    }
+
+    /// Finish the current variable-length list array slot
+    ///
+    /// Records the current length of the values builder as the end offset of the slot
+    /// and `is_valid` as its validity bit.
+    pub fn append(&mut self, is_valid: bool) -> Result<()> {
+        self.offsets_builder
+            .append(self.values_builder.len() as i64)?;
+        self.bitmap_builder.append(is_valid)?;
+        self.len += 1;
+        Ok(())
+    }
+
+    /// Builds the `LargeListArray` and reset this builder.
+    pub fn finish(&mut self) -> LargeListArray {
+        let len = self.len();
+        self.len = 0;
+        // `values_builder` is already a `T`, so it can be finished directly without
+        // a round trip through `Any`.
+        let values_arr = self.values_builder.finish();
+        let values_data = values_arr.data();
+
+        let offset_buffer = self.offsets_builder.finish();
+        let null_bit_buffer = self.bitmap_builder.finish();
+        // re-seed the offsets with the leading 0 so the builder can be reused
+        self.offsets_builder.append(0).unwrap();
+        let data = ArrayData::builder(DataType::LargeList(Box::new(
+            values_data.data_type().clone(),
+        )))
+        .len(len)
+        // nulls are the slots whose validity bit is not set
+        .null_count(len - bit_util::count_set_bits(null_bit_buffer.data()))
+        .add_buffer(offset_buffer)
+        .add_child_data(values_data)
+        .null_bit_buffer(null_bit_buffer)
+        .build();
+
+        LargeListArray::from(data)
+    }
+}
+
+/// Array builder for `ListArray`
pub struct FixedSizeListBuilder<T: ArrayBuilder> {
bitmap_builder: BooleanBufferBuilder,
values_builder: T,
@@ -1007,10 +1208,18 @@ pub struct BinaryBuilder {
builder: ListBuilder<UInt8Builder>,
}
+pub struct LargeBinaryBuilder {
+ builder: LargeListBuilder<UInt8Builder>,
+}
+
pub struct StringBuilder {
builder: ListBuilder<UInt8Builder>,
}
+pub struct LargeStringBuilder {
+ builder: LargeListBuilder<UInt8Builder>,
+}
+
pub struct FixedSizeBinaryBuilder {
builder: FixedSizeListBuilder<UInt8Builder>,
}
@@ -1019,6 +1228,8 @@ pub trait BinaryArrayBuilder: ArrayBuilder {}
impl BinaryArrayBuilder for BinaryBuilder {}
impl BinaryArrayBuilder for StringBuilder {}
+impl BinaryArrayBuilder for LargeStringBuilder {}
+impl BinaryArrayBuilder for LargeBinaryBuilder {}
impl BinaryArrayBuilder for FixedSizeBinaryBuilder {}
impl ArrayBuilder for BinaryBuilder {
@@ -1062,6 +1273,47 @@ impl ArrayBuilder for BinaryBuilder {
}
}
+impl ArrayBuilder for LargeBinaryBuilder {
+ /// Returns the builder as a non-mutable `Any` reference.
+ fn as_any(&self) -> &Any {
+ self
+ }
+
+ /// Returns the builder as a mutable `Any` reference.
+ fn as_any_mut(&mut self) -> &mut Any {
+ self
+ }
+
+ /// Appends data from other arrays into the builder
+ ///
+ /// This is most useful when concatenating arrays of the same type into a
builder.
+ fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> {
+ append_large_binary_data(&mut self.builder, &DataType::LargeBinary,
data)
+ }
+
+ /// Returns the data type of the builder
+ ///
+ /// This is used for validating array data types in `append_data`
+ fn data_type(&self) -> DataType {
+ DataType::LargeBinary
+ }
+
+ /// Returns the boxed builder as a box of `Any`.
+ fn into_box_any(self: Box<Self>) -> Box<Any> {
+ self
+ }
+
+ /// Returns the number of array slots in the builder
+ fn len(&self) -> usize {
+ self.builder.len()
+ }
+
+ /// Builds the array and reset this builder.
+ fn finish(&mut self) -> ArrayRef {
+ Arc::new(self.finish())
+ }
+}
+
impl ArrayBuilder for StringBuilder {
/// Returns the builder as a non-mutable `Any` reference.
fn as_any(&self) -> &Any {
@@ -1155,6 +1407,99 @@ fn append_binary_data(
Ok(())
}
+// Helper function for appending LargeBinary and LargeUtf8 data
+//
+// Each input array is re-described as a `LargeList<UInt8>` that shares the same buffers,
+// so that `LargeListBuilder::append_data` can do the actual appending.
+fn append_large_binary_data(
+    builder: &mut LargeListBuilder<UInt8Builder>,
+    data_type: &DataType,
+    data: &[ArrayDataRef],
+) -> Result<()> {
+    // validate arraydata; nothing is reserved here
+    for array in data {
+        if array.data_type() != data_type {
+            return Err(ArrowError::InvalidArgumentError(
+                "Cannot append data to builder if data types are different".to_string(),
+            ));
+        }
+        if array.buffers().len() != 2 {
+            return Err(ArrowError::InvalidArgumentError(
+                "Binary/String arrays should have 2 buffers".to_string(),
+            ));
+        }
+    }
+
+    let list_data = data
+        .iter()
+        .map(|array| {
+            // buffer 1 holds the value bytes: expose it as the UInt8 child array
+            let value_buffer = &array.buffers()[1];
+            let values = Arc::new(ArrayData::new(
+                DataType::UInt8,
+                value_buffer.len(),
+                None,
+                None,
+                0,
+                vec![value_buffer.clone()],
+                vec![],
+            )) as ArrayDataRef;
+
+            // buffer 0 holds the offsets: reuse it, along with the validity bitmap and
+            // array offset, for the list view
+            Arc::new(ArrayData::new(
+                DataType::LargeList(Box::new(DataType::UInt8)),
+                array.len(),
+                None,
+                array.null_buffer().cloned(),
+                array.offset(),
+                vec![array.buffers()[0].clone()],
+                vec![values],
+            ))
+        })
+        .collect::<Vec<ArrayDataRef>>();
+
+    builder.append_data(&list_data)
+}
+
+impl ArrayBuilder for LargeStringBuilder {
+ /// Returns the builder as a non-mutable `Any` reference.
+ fn as_any(&self) -> &Any {
+ self
+ }
+
+ /// Returns the builder as a mutable `Any` reference.
+ fn as_any_mut(&mut self) -> &mut Any {
+ self
+ }
+
+ /// Appends data from other arrays into the builder
+ ///
+ /// This is most useful when concatenating arrays of the same type into a
builder.
+ fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> {
+ append_large_binary_data(&mut self.builder, &DataType::LargeUtf8, data)
+ }
+
+ /// Returns the data type of the builder
+ ///
+ /// This is used for validating array data types in `append_data`
+ fn data_type(&self) -> DataType {
+ DataType::LargeUtf8
+ }
+
+ /// Returns the boxed builder as a box of `Any`.
+ fn into_box_any(self: Box<Self>) -> Box<Any> {
+ self
+ }
+
+ /// Returns the number of array slots in the builder
+ fn len(&self) -> usize {
+ self.builder.len()
+ }
+
+ /// Builds the array and reset this builder.
+ fn finish(&mut self) -> ArrayRef {
+ Arc::new(self.finish())
+ }
+}
+
impl ArrayBuilder for FixedSizeBinaryBuilder {
/// Returns the builder as a non-mutable `Any` reference.
fn as_any(&self) -> &Any {
@@ -1278,6 +1623,51 @@ impl BinaryBuilder {
}
}
+impl LargeBinaryBuilder {
+ /// Creates a new `LargeBinaryBuilder`, `capacity` is the number of bytes
in the values
+ /// array
+ pub fn new(capacity: usize) -> Self {
+ let values_builder = UInt8Builder::new(capacity);
+ Self {
+ builder: LargeListBuilder::new(values_builder),
+ }
+ }
+
+ /// Appends a single byte value into the builder's values array.
+ ///
+ /// Note, when appending individual byte values you must call `append` to
delimit each
+ /// distinct list value.
+ pub fn append_byte(&mut self, value: u8) -> Result<()> {
+ self.builder.values().append_value(value)?;
+ Ok(())
+ }
+
+ /// Appends a byte slice into the builder.
+ ///
+ /// Automatically calls the `append` method to delimit the slice appended
in as a
+ /// distinct array element.
+ pub fn append_value(&mut self, value: &[u8]) -> Result<()> {
+ self.builder.values().append_slice(value)?;
+ self.builder.append(true)?;
+ Ok(())
+ }
+
+ /// Finish the current variable-length list array slot.
+ pub fn append(&mut self, is_valid: bool) -> Result<()> {
+ self.builder.append(is_valid)
+ }
+
+ /// Append a null value to the array.
+ pub fn append_null(&mut self) -> Result<()> {
+ self.append(false)
+ }
+
+ /// Builds the `LargeBinaryArray` and reset this builder.
+ pub fn finish(&mut self) -> LargeBinaryArray {
+ LargeBinaryArray::from(self.builder.finish())
+ }
+}
+
impl StringBuilder {
/// Creates a new `StringBuilder`,
/// `capacity` is the number of bytes of string data to pre-allocate space
for in this builder
@@ -1324,6 +1714,52 @@ impl StringBuilder {
}
}
+impl LargeStringBuilder {
+    /// Creates a new `LargeStringBuilder`,
+    /// `capacity` is the number of bytes of string data to pre-allocate space for in this
+    /// builder
+    pub fn new(capacity: usize) -> Self {
+        let values_builder = UInt8Builder::new(capacity);
+        Self {
+            builder: LargeListBuilder::new(values_builder),
+        }
+    }
+
+    /// Creates a new `LargeStringBuilder`,
+    /// `data_capacity` is the number of bytes of string data to pre-allocate space for in
+    /// this builder
+    /// `item_capacity` is the number of items to pre-allocate space for in this builder
+    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+        let values_builder = UInt8Builder::new(data_capacity);
+        Self {
+            builder: LargeListBuilder::with_capacity(values_builder, item_capacity),
+        }
+    }
+
+    /// Appends a string into the builder.
+    ///
+    /// Automatically calls the `append` method to delimit the string appended in as a
+    /// distinct array element.
+    pub fn append_value(&mut self, value: &str) -> Result<()> {
+        self.builder.values().append_slice(value.as_bytes())?;
+        self.builder.append(true)?;
+        Ok(())
+    }
+
+    /// Finish the current variable-length list array slot.
+    pub fn append(&mut self, is_valid: bool) -> Result<()> {
+        self.builder.append(is_valid)
+    }
+
+    /// Append a null value to the array.
+    pub fn append_null(&mut self) -> Result<()> {
+        self.append(false)
+    }
+
+    /// Builds the `LargeStringArray` and reset this builder.
+    pub fn finish(&mut self) -> LargeStringArray {
+        LargeStringArray::from(self.builder.finish())
+    }
+}
+
impl FixedSizeBinaryBuilder {
/// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in
the values
/// array
@@ -2313,6 +2749,45 @@ mod tests {
}
#[test]
+ fn test_large_list_array_builder() {
+ let values_builder = Int32Builder::new(10);
+ let mut builder = LargeListBuilder::new(values_builder);
+
+ // [[0, 1, 2], [3, 4, 5], [6, 7]]
+ builder.values().append_value(0).unwrap();
+ builder.values().append_value(1).unwrap();
+ builder.values().append_value(2).unwrap();
+ builder.append(true).unwrap();
+ builder.values().append_value(3).unwrap();
+ builder.values().append_value(4).unwrap();
+ builder.values().append_value(5).unwrap();
+ builder.append(true).unwrap();
+ builder.values().append_value(6).unwrap();
+ builder.values().append_value(7).unwrap();
+ builder.append(true).unwrap();
+ let list_array = builder.finish();
+
+ let values = list_array.values().data().buffers()[0].clone();
+ assert_eq!(
+ Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice()),
+ values
+ );
+ assert_eq!(
+ Buffer::from(&[0i64, 3, 6, 8].to_byte_slice()),
+ list_array.data().buffers()[0].clone()
+ );
+ assert_eq!(DataType::Int32, list_array.value_type());
+ assert_eq!(3, list_array.len());
+ assert_eq!(0, list_array.null_count());
+ assert_eq!(6, list_array.value_offset(2));
+ assert_eq!(2, list_array.value_length(2));
+ for i in 0..3 {
+ assert!(list_array.is_valid(i));
+ assert!(!list_array.is_null(i));
+ }
+ }
+
+ #[test]
fn test_list_array_builder_nulls() {
let values_builder = Int32Builder::new(10);
let mut builder = ListBuilder::new(values_builder);
@@ -2340,6 +2815,33 @@ mod tests {
}
#[test]
+ fn test_large_list_array_builder_nulls() {
+ let values_builder = Int32Builder::new(10);
+ let mut builder = LargeListBuilder::new(values_builder);
+
+ // [[0, 1, 2], null, [3, null, 5], [6, 7]]
+ builder.values().append_value(0).unwrap();
+ builder.values().append_value(1).unwrap();
+ builder.values().append_value(2).unwrap();
+ builder.append(true).unwrap();
+ builder.append(false).unwrap();
+ builder.values().append_value(3).unwrap();
+ builder.values().append_null().unwrap();
+ builder.values().append_value(5).unwrap();
+ builder.append(true).unwrap();
+ builder.values().append_value(6).unwrap();
+ builder.values().append_value(7).unwrap();
+ builder.append(true).unwrap();
+ let list_array = builder.finish();
+
+ assert_eq!(DataType::Int32, list_array.value_type());
+ assert_eq!(4, list_array.len());
+ assert_eq!(1, list_array.null_count());
+ assert_eq!(3, list_array.value_offset(2));
+ assert_eq!(3, list_array.value_length(2));
+ }
+
+ #[test]
fn test_fixed_size_list_array_builder() {
let values_builder = Int32Builder::new(10);
let mut builder = FixedSizeListBuilder::new(values_builder, 3);
@@ -2509,6 +3011,37 @@ mod tests {
}
#[test]
+ fn test_large_binary_array_builder() {
+ let mut builder = LargeBinaryBuilder::new(20);
+
+ builder.append_byte(b'h').unwrap();
+ builder.append_byte(b'e').unwrap();
+ builder.append_byte(b'l').unwrap();
+ builder.append_byte(b'l').unwrap();
+ builder.append_byte(b'o').unwrap();
+ builder.append(true).unwrap();
+ builder.append(true).unwrap();
+ builder.append_byte(b'w').unwrap();
+ builder.append_byte(b'o').unwrap();
+ builder.append_byte(b'r').unwrap();
+ builder.append_byte(b'l').unwrap();
+ builder.append_byte(b'd').unwrap();
+ builder.append(true).unwrap();
+
+ let array = builder.finish();
+
+ let binary_array = LargeBinaryArray::from(array);
+
+ assert_eq!(3, binary_array.len());
+ assert_eq!(0, binary_array.null_count());
+ assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0));
+ assert_eq!([] as [u8; 0], binary_array.value(1));
+ assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2));
+ assert_eq!(5, binary_array.value_offset(2));
+ assert_eq!(5, binary_array.value_length(2));
+ }
+
+ #[test]
fn test_string_array_builder() {
let mut builder = StringBuilder::new(20);
diff --git a/rust/arrow/src/array/equal.rs b/rust/arrow/src/array/equal.rs
index ab76c5d..02707a5 100644
--- a/rust/arrow/src/array/equal.rs
+++ b/rust/arrow/src/array/equal.rs
@@ -227,6 +227,78 @@ impl ArrayEqual for ListArray {
}
}
+impl ArrayEqual for LargeListArray {
+    /// Checks whether `self` and `other` hold equal lists: `base_equal` must pass, the
+    /// value offsets must match, and the child values covered by the offsets must be
+    /// equal.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `base_equal` passes but `other` is not a `LargeListArray`.
+    fn equals(&self, other: &dyn Array) -> bool {
+        if !base_equal(&self.data(), &other.data()) {
+            return false;
+        }
+
+        let other = other.as_any().downcast_ref::<LargeListArray>().unwrap();
+
+        if !large_value_offset_equal(self, other) {
+            return false;
+        }
+
+        if !self.values().range_equals(
+            &*other.values(),
+            self.value_offset(0) as usize,
+            self.value_offset(self.len()) as usize,
+            other.value_offset(0) as usize,
+        ) {
+            return false;
+        }
+
+        true
+    }
+
+    /// Compares slots `start_idx..end_idx` of `self` with the same number of slots of
+    /// `other`, starting at `other_start_idx`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the compared range does not fit in `other`, or if `other` is not a
+    /// `LargeListArray`.
+    fn range_equals(
+        &self,
+        other: &dyn Array,
+        start_idx: usize,
+        end_idx: usize,
+        other_start_idx: usize,
+    ) -> bool {
+        assert!(other_start_idx + (end_idx - start_idx) <= other.len());
+        let other = other.as_any().downcast_ref::<LargeListArray>().unwrap();
+
+        // Walk both arrays in lockstep. The index into `other` must advance on null
+        // slots as well, otherwise every slot after a null is compared against the
+        // wrong slot of `other`.
+        for (i, j) in (start_idx..end_idx).zip(other_start_idx..) {
+            let is_null = self.is_null(i);
+            let other_is_null = other.is_null(j);
+
+            if is_null != other_is_null {
+                return false;
+            }
+
+            if is_null {
+                continue;
+            }
+
+            let start_offset = self.value_offset(i) as usize;
+            let end_offset = self.value_offset(i + 1) as usize;
+            let other_start_offset = other.value_offset(j) as usize;
+            let other_end_offset = other.value_offset(j + 1) as usize;
+
+            // lists of different lengths can never be equal
+            if end_offset - start_offset != other_end_offset - other_start_offset {
+                return false;
+            }
+
+            if !self.values().range_equals(
+                &*other.values(),
+                start_offset,
+                end_offset,
+                other_start_offset,
+            ) {
+                return false;
+            }
+        }
+
+        true
+    }
+}
+
impl<T: ArrowPrimitiveType> ArrayEqual for DictionaryArray<T> {
fn equals(&self, other: &dyn Array) -> bool {
self.range_equals(other, 0, self.len(), 0)
@@ -529,6 +601,214 @@ impl ArrayEqual for StringArray {
}
}
+impl ArrayEqual for LargeBinaryArray {
+    /// Checks whether `self` and `other` hold equal binary values: `base_equal` must
+    /// pass, the value offsets must match, and the value bytes of every non-null slot
+    /// must be equal.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `base_equal` passes but `other` is not a `LargeBinaryArray`.
+    fn equals(&self, other: &dyn Array) -> bool {
+        if !base_equal(&self.data(), &other.data()) {
+            return false;
+        }
+
+        let other = other.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
+
+        if !large_value_offset_equal(self, other) {
+            return false;
+        }
+
+        // TODO: handle null & length == 0 case?
+
+        let value_buf = self.value_data();
+        let other_value_buf = other.value_data();
+        let value_data = value_buf.data();
+        let other_value_data = other_value_buf.data();
+
+        if self.null_count() == 0 {
+            // No offset in both - just do memcmp
+            if self.offset() == 0 && other.offset() == 0 {
+                let len = self.value_offset(self.len()) as usize;
+                return value_data[..len] == other_value_data[..len];
+            } else {
+                // At least one array is sliced: compare the byte ranges covered by each
+                // array's own first offset.
+                let start = self.value_offset(0) as usize;
+                let other_start = other.value_offset(0) as usize;
+                let len =
+                    (self.value_offset(self.len()) - self.value_offset(0)) as usize;
+                return value_data[start..(start + len)]
+                    == other_value_data[other_start..(other_start + len)];
+            }
+        } else {
+            // With nulls present, only the bytes of non-null slots are compared.
+            for i in 0..self.len() {
+                if self.is_null(i) {
+                    continue;
+                }
+
+                let start = self.value_offset(i) as usize;
+                let other_start = other.value_offset(i) as usize;
+                let len = self.value_length(i) as usize;
+                if value_data[start..(start + len)]
+                    != other_value_data[other_start..(other_start + len)]
+                {
+                    return false;
+                }
+            }
+        }
+
+        true
+    }
+
+    /// Compares slots `start_idx..end_idx` of `self` with the same number of slots of
+    /// `other`, starting at `other_start_idx`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the compared range does not fit in `other`, or if `other` is not a
+    /// `LargeBinaryArray`.
+    fn range_equals(
+        &self,
+        other: &dyn Array,
+        start_idx: usize,
+        end_idx: usize,
+        other_start_idx: usize,
+    ) -> bool {
+        assert!(other_start_idx + (end_idx - start_idx) <= other.len());
+        let other = other.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
+
+        // The value buffers are the same for every slot, so fetch them once.
+        let value_buf = self.value_data();
+        let other_value_buf = other.value_data();
+        let value_data = value_buf.data();
+        let other_value_data = other_value_buf.data();
+
+        // Walk both arrays in lockstep. The index into `other` must advance on null
+        // slots as well, otherwise every slot after a null is compared against the
+        // wrong slot of `other`.
+        for (i, j) in (start_idx..end_idx).zip(other_start_idx..) {
+            let is_null = self.is_null(i);
+            let other_is_null = other.is_null(j);
+
+            if is_null != other_is_null {
+                return false;
+            }
+
+            if is_null {
+                continue;
+            }
+
+            let start_offset = self.value_offset(i) as usize;
+            let end_offset = self.value_offset(i + 1) as usize;
+            let other_start_offset = other.value_offset(j) as usize;
+            let other_end_offset = other.value_offset(j + 1) as usize;
+
+            let len = end_offset - start_offset;
+            if len != other_end_offset - other_start_offset {
+                return false;
+            }
+
+            // empty values are equal without touching the value buffers
+            if len > 0
+                && value_data[start_offset..(start_offset + len)]
+                    != other_value_data[other_start_offset..(other_start_offset + len)]
+            {
+                return false;
+            }
+        }
+
+        true
+    }
+}
+
+impl ArrayEqual for LargeStringArray {
+    /// Checks whether `self` and `other` hold equal strings: `base_equal` must pass,
+    /// the value offsets must match, and the value bytes of every non-null slot must be
+    /// equal.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `base_equal` passes but `other` is not a `LargeStringArray`.
+    fn equals(&self, other: &dyn Array) -> bool {
+        if !base_equal(&self.data(), &other.data()) {
+            return false;
+        }
+
+        let other = other.as_any().downcast_ref::<LargeStringArray>().unwrap();
+
+        if !large_value_offset_equal(self, other) {
+            return false;
+        }
+
+        // TODO: handle null & length == 0 case?
+
+        let value_buf = self.value_data();
+        let other_value_buf = other.value_data();
+        let value_data = value_buf.data();
+        let other_value_data = other_value_buf.data();
+
+        if self.null_count() == 0 {
+            // No offset in both - just do memcmp
+            if self.offset() == 0 && other.offset() == 0 {
+                let len = self.value_offset(self.len()) as usize;
+                return value_data[..len] == other_value_data[..len];
+            } else {
+                // At least one array is sliced: compare the byte ranges covered by each
+                // array's own first offset.
+                let start = self.value_offset(0) as usize;
+                let other_start = other.value_offset(0) as usize;
+                let len =
+                    (self.value_offset(self.len()) - self.value_offset(0)) as usize;
+                return value_data[start..(start + len)]
+                    == other_value_data[other_start..(other_start + len)];
+            }
+        } else {
+            // With nulls present, only the bytes of non-null slots are compared.
+            for i in 0..self.len() {
+                if self.is_null(i) {
+                    continue;
+                }
+
+                let start = self.value_offset(i) as usize;
+                let other_start = other.value_offset(i) as usize;
+                let len = self.value_length(i) as usize;
+                if value_data[start..(start + len)]
+                    != other_value_data[other_start..(other_start + len)]
+                {
+                    return false;
+                }
+            }
+        }
+
+        true
+    }
+
+    /// Compares slots `start_idx..end_idx` of `self` with the same number of slots of
+    /// `other`, starting at `other_start_idx`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the compared range does not fit in `other`, or if `other` is not a
+    /// `LargeStringArray`.
+    fn range_equals(
+        &self,
+        other: &dyn Array,
+        start_idx: usize,
+        end_idx: usize,
+        other_start_idx: usize,
+    ) -> bool {
+        assert!(other_start_idx + (end_idx - start_idx) <= other.len());
+        let other = other.as_any().downcast_ref::<LargeStringArray>().unwrap();
+
+        // The value buffers are the same for every slot, so fetch them once.
+        let value_buf = self.value_data();
+        let other_value_buf = other.value_data();
+        let value_data = value_buf.data();
+        let other_value_data = other_value_buf.data();
+
+        // Walk both arrays in lockstep. The index into `other` must advance on null
+        // slots as well, otherwise every slot after a null is compared against the
+        // wrong slot of `other`.
+        for (i, j) in (start_idx..end_idx).zip(other_start_idx..) {
+            let is_null = self.is_null(i);
+            let other_is_null = other.is_null(j);
+
+            if is_null != other_is_null {
+                return false;
+            }
+
+            if is_null {
+                continue;
+            }
+
+            let start_offset = self.value_offset(i) as usize;
+            let end_offset = self.value_offset(i + 1) as usize;
+            let other_start_offset = other.value_offset(j) as usize;
+            let other_end_offset = other.value_offset(j + 1) as usize;
+
+            let len = end_offset - start_offset;
+            if len != other_end_offset - other_start_offset {
+                return false;
+            }
+
+            // empty values are equal without touching the value buffers
+            if len > 0
+                && value_data[start_offset..(start_offset + len)]
+                    != other_value_data[other_start_offset..(other_start_offset + len)]
+            {
+                return false;
+            }
+        }
+
+        true
+    }
+}
+
impl ArrayEqual for FixedSizeBinaryArray {
fn equals(&self, other: &dyn Array) -> bool {
if !base_equal(&self.data(), &other.data()) {
@@ -799,6 +1079,28 @@ fn value_offset_equal<T: Array + ListArrayOps>(this: &T,
other: &T) -> bool {
true
}
+// Compare if the value offsets are equal between the two large (i64-offset) list arrays.
+// Offsets are compared relative to each array's first offset.
+fn large_value_offset_equal<T: Array + LargeListArrayOps>(this: &T, other: &T) -> bool {
+    // Fast path: neither array is sliced, so the raw offset buffers can be compared.
+    if this.offset() == 0 && other.offset() == 0 {
+        // Large arrays store one `i64` (8 bytes) per offset, `len + 1` offsets in total.
+        // Comparing with a 4-byte width would only cover half of the offsets.
+        let offset_width = std::mem::size_of::<i64>();
+        let offset_data = &this.data_ref().buffers()[0];
+        let other_offset_data = &other.data_ref().buffers()[0];
+        return offset_data.data()[0..((this.len() + 1) * offset_width)]
+            == other_offset_data.data()[0..((other.len() + 1) * offset_width)];
+    }
+
+    // The expensive case
+    for i in 0..=this.len() {
+        if this.value_offset_at(i) - this.value_offset_at(0)
+            != other.value_offset_at(i) - other.value_offset_at(0)
+        {
+            return false;
+        }
+    }
+
+    true
+}
+
/// Trait for comparing arrow array with json array
pub trait JsonEqual {
/// Checks whether arrow array equals to json array.
@@ -858,6 +1160,20 @@ impl JsonEqual for ListArray {
}
}
+impl JsonEqual for LargeListArray {
+ fn equals_json(&self, json: &[&Value]) -> bool {
+ if self.len() != json.len() {
+ return false;
+ }
+
+ (0..self.len()).all(|i| match json[i] {
+ Value::Array(v) => self.is_valid(i) &&
self.value(i).equals_json_values(v),
+ Value::Null => self.is_null(i) || self.value_length(i) == 0,
+ _ => false,
+ })
+ }
+}
+
impl PartialEq<Value> for ListArray {
fn eq(&self, json: &Value) -> bool {
match json {
@@ -867,6 +1183,15 @@ impl PartialEq<Value> for ListArray {
}
}
+impl PartialEq<Value> for LargeListArray {
+ fn eq(&self, json: &Value) -> bool {
+ match json {
+ Value::Array(json_array) => self.equals_json_values(json_array),
+ _ => false,
+ }
+ }
+}
+
impl PartialEq<ListArray> for Value {
fn eq(&self, arrow: &ListArray) -> bool {
match self {
@@ -876,6 +1201,15 @@ impl PartialEq<ListArray> for Value {
}
}
+impl PartialEq<LargeListArray> for Value {
+ fn eq(&self, arrow: &LargeListArray) -> bool {
+ match self {
+ Value::Array(json_array) => arrow.equals_json_values(json_array),
+ _ => false,
+ }
+ }
+}
+
impl<T: ArrowPrimitiveType> JsonEqual for DictionaryArray<T> {
fn equals_json(&self, json: &[&Value]) -> bool {
self.keys().zip(json.iter()).all(|aj| match aj {
@@ -1028,6 +1362,44 @@ impl PartialEq<BinaryArray> for Value {
}
}
+impl JsonEqual for LargeBinaryArray {
+ fn equals_json(&self, json: &[&Value]) -> bool {
+ if self.len() != json.len() {
+ return false;
+ }
+
+ (0..self.len()).all(|i| match json[i] {
+ JString(s) => {
+ // binary data is sometimes hex encoded, this checks if bytes
are equal,
+ // and if not converting to hex is attempted
+ self.is_valid(i)
+ && (s.as_str().as_bytes() == self.value(i)
+ || Vec::from_hex(s.as_str()) ==
Ok(self.value(i).to_vec()))
+ }
+ JNull => self.is_null(i),
+ _ => false,
+ })
+ }
+}
+
+impl PartialEq<Value> for LargeBinaryArray {
+ fn eq(&self, json: &Value) -> bool {
+ match json {
+ Value::Array(json_array) => self.equals_json_values(&json_array),
+ _ => false,
+ }
+ }
+}
+
+impl PartialEq<LargeBinaryArray> for Value {
+ fn eq(&self, arrow: &LargeBinaryArray) -> bool {
+ match self {
+ Value::Array(json_array) => arrow.equals_json_values(&json_array),
+ _ => false,
+ }
+ }
+}
+
impl JsonEqual for StringArray {
fn equals_json(&self, json: &[&Value]) -> bool {
if self.len() != json.len() {
@@ -1060,6 +1432,38 @@ impl PartialEq<StringArray> for Value {
}
}
+impl JsonEqual for LargeStringArray {
+ fn equals_json(&self, json: &[&Value]) -> bool {
+ if self.len() != json.len() {
+ return false;
+ }
+
+ (0..self.len()).all(|i| match json[i] {
+ JString(s) => self.is_valid(i) && s.as_str() == self.value(i),
+ JNull => self.is_null(i),
+ _ => false,
+ })
+ }
+}
+
+impl PartialEq<Value> for LargeStringArray {
+ fn eq(&self, json: &Value) -> bool {
+ match json {
+ Value::Array(json_array) => self.equals_json_values(&json_array),
+ _ => false,
+ }
+ }
+}
+
+impl PartialEq<LargeStringArray> for Value {
+ fn eq(&self, arrow: &LargeStringArray) -> bool {
+ match self {
+ Value::Array(json_array) => arrow.equals_json_values(&json_array),
+ _ => false,
+ }
+ }
+}
+
impl JsonEqual for FixedSizeBinaryArray {
fn equals_json(&self, json: &[&Value]) -> bool {
if self.len() != json.len() {
@@ -1456,6 +1860,83 @@ mod tests {
}
#[test]
+ fn test_large_string_equal() {
+ let a = LargeStringArray::from(vec!["hello", "world"]);
+ let b = LargeStringArray::from(vec!["hello", "world"]);
+ assert!(a.equals(&b));
+ assert!(b.equals(&a));
+
+ let b = LargeStringArray::from(vec!["hello", "arrow"]);
+ assert!(!a.equals(&b));
+ assert!(!b.equals(&a));
+
+ // Test the case where null_count > 0
+
+ let a = LargeStringArray::try_from(vec![
+ Some("hello"),
+ None,
+ None,
+ Some("world"),
+ None,
+ None,
+ ])
+ .unwrap();
+
+ let b = LargeStringArray::try_from(vec![
+ Some("hello"),
+ None,
+ None,
+ Some("world"),
+ None,
+ None,
+ ])
+ .unwrap();
+ assert!(a.equals(&b));
+ assert!(b.equals(&a));
+
+ let b = LargeStringArray::try_from(vec![
+ Some("hello"),
+ Some("foo"),
+ None,
+ Some("world"),
+ None,
+ None,
+ ])
+ .unwrap();
+ assert!(!a.equals(&b));
+ assert!(!b.equals(&a));
+
+ let b = LargeStringArray::try_from(vec![
+ Some("hello"),
+ None,
+ None,
+ Some("arrow"),
+ None,
+ None,
+ ])
+ .unwrap();
+ assert!(!a.equals(&b));
+ assert!(!b.equals(&a));
+
+ // Test the case where offset != 0
+
+ let a_slice = a.slice(0, 3);
+ let b_slice = b.slice(0, 3);
+ assert!(a_slice.equals(&*b_slice));
+ assert!(b_slice.equals(&*a_slice));
+
+ let a_slice = a.slice(0, 5);
+ let b_slice = b.slice(0, 5);
+ assert!(!a_slice.equals(&*b_slice));
+ assert!(!b_slice.equals(&*a_slice));
+
+ let a_slice = a.slice(4, 1);
+ let b_slice = b.slice(4, 1);
+ assert!(a_slice.equals(&*b_slice));
+ assert!(b_slice.equals(&*a_slice));
+ }
+
+ #[test]
fn test_struct_equal() {
let string_builder = StringBuilder::new(5);
let int_builder = Int32Builder::new(5);
diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs
index c2bb03d..3abc142 100644
--- a/rust/arrow/src/array/mod.rs
+++ b/rust/arrow/src/array/mod.rs
@@ -105,6 +105,9 @@ pub use self::array::BinaryArray;
pub use self::array::DictionaryArray;
pub use self::array::FixedSizeBinaryArray;
pub use self::array::FixedSizeListArray;
+pub use self::array::LargeBinaryArray;
+pub use self::array::LargeListArray;
+pub use self::array::LargeStringArray;
pub use self::array::ListArray;
pub use self::array::PrimitiveArray;
pub use self::array::StringArray;
@@ -152,6 +155,7 @@ pub type DurationMillisecondArray =
PrimitiveArray<DurationMillisecondType>;
pub type DurationMicrosecondArray = PrimitiveArray<DurationMicrosecondType>;
pub type DurationNanosecondArray = PrimitiveArray<DurationNanosecondType>;
+pub use self::array::LargeListArrayOps;
pub use self::array::ListArrayOps;
pub use self::array::PrimitiveArrayOps;
@@ -193,6 +197,9 @@ pub use self::builder::ArrayBuilder;
pub use self::builder::BinaryBuilder;
pub use self::builder::FixedSizeBinaryBuilder;
pub use self::builder::FixedSizeListBuilder;
+pub use self::builder::LargeBinaryBuilder;
+pub use self::builder::LargeListBuilder;
+pub use self::builder::LargeStringBuilder;
pub use self::builder::ListBuilder;
pub use self::builder::PrimitiveBuilder;
pub use self::builder::PrimitiveDictionaryBuilder;
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index 1bc5027..f140e23 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -116,12 +116,18 @@ pub enum DataType {
/// Opaque binary data of fixed size.
/// Enum parameter specifies the number of bytes per value.
FixedSizeBinary(i32),
+ /// Opaque binary data of variable length and 64-bit offsets.
+ LargeBinary,
/// A variable-length string in Unicode with UTF-8 encoding.
Utf8,
+ /// A variable-length string in Unicode with UTF-8 encoding and 64-bit
offsets.
+ LargeUtf8,
/// A list of some logical data type with variable length.
List(Box<DataType>),
/// A list of some logical data type with fixed length.
FixedSizeList(Box<DataType>, i32),
+ /// A list of some logical data type with variable length and 64-bit
offsets.
+ LargeList(Box<DataType>),
/// A nested datatype that contains a number of sub-fields.
Struct(Vec<Field>),
/// A nested datatype that can represent slots of differing types.
@@ -769,7 +775,9 @@ impl DataType {
Some(s) if s == "null" => Ok(DataType::Null),
Some(s) if s == "bool" => Ok(DataType::Boolean),
Some(s) if s == "binary" => Ok(DataType::Binary),
+ Some(s) if s == "largebinary" => Ok(DataType::LargeBinary),
Some(s) if s == "utf8" => Ok(DataType::Utf8),
+ Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8),
Some(s) if s == "fixedsizebinary" => {
// return a list with any type as its child isn't defined
in the map
if let Some(Value::Number(size)) = map.get("byteWidth") {
@@ -897,6 +905,10 @@ impl DataType {
// return a list with any type as its child isn't defined
in the map
Ok(DataType::List(Box::new(DataType::Boolean)))
}
+ Some(s) if s == "largelist" => {
+ // return a largelist with a placeholder child type, as the child isn't
defined in the map
+ Ok(DataType::LargeList(Box::new(DataType::Boolean)))
+ }
Some(s) if s == "fixedsizelist" => {
// return a list with any type as its child isn't defined
in the map
if let Some(Value::Number(size)) = map.get("listSize") {
@@ -943,13 +955,16 @@ impl DataType {
DataType::Float32 => json!({"name": "floatingpoint", "precision":
"SINGLE"}),
DataType::Float64 => json!({"name": "floatingpoint", "precision":
"DOUBLE"}),
DataType::Utf8 => json!({"name": "utf8"}),
+ DataType::LargeUtf8 => json!({"name": "largeutf8"}),
DataType::Binary => json!({"name": "binary"}),
+ DataType::LargeBinary => json!({"name": "largebinary"}),
DataType::FixedSizeBinary(byte_width) => {
json!({"name": "fixedsizebinary", "byteWidth": byte_width})
}
DataType::Struct(_) => json!({"name": "struct"}),
DataType::Union(_) => json!({"name": "union"}),
DataType::List(_) => json!({ "name": "list"}),
+ DataType::LargeList(_) => json!({ "name": "largelist"}),
DataType::FixedSizeList(_, length) => {
json!({"name":"fixedsizelist", "listSize": length})
}
@@ -1080,18 +1095,22 @@ impl Field {
};
// if data_type is a struct or list, get its children
let data_type = match data_type {
- DataType::List(_) | DataType::FixedSizeList(_, _) => {
- match map.get("children") {
- Some(Value::Array(values)) => {
- if values.len() != 1 {
- return Err(ArrowError::ParseError(
+ DataType::List(_)
+ | DataType::LargeList(_)
+ | DataType::FixedSizeList(_, _) => match
map.get("children") {
+ Some(Value::Array(values)) => {
+ if values.len() != 1 {
+ return Err(ArrowError::ParseError(
"Field 'children' must have one element
for a list data type".to_string(),
));
- }
- match data_type {
+ }
+ match data_type {
DataType::List(_) =>
DataType::List(Box::new(
Self::from(&values[0])?.data_type,
)),
+ DataType::LargeList(_) =>
DataType::LargeList(Box::new(
+ Self::from(&values[0])?.data_type,
+ )),
DataType::FixedSizeList(_, int) => {
DataType::FixedSizeList(
Box::new(Self::from(&values[0])?.data_type),
@@ -1099,22 +1118,21 @@ impl Field {
)
}
_ => unreachable!(
- "Data type should be a list or
fixedsizelist"
+ "Data type should be a list, largelist
or fixedsizelist"
),
}
- }
- Some(_) => {
- return Err(ArrowError::ParseError(
- "Field 'children' must be an
array".to_string(),
- ))
- }
- None => {
- return Err(ArrowError::ParseError(
- "Field missing 'children'
attribute".to_string(),
- ));
- }
}
- }
+ Some(_) => {
+ return Err(ArrowError::ParseError(
+ "Field 'children' must be an
array".to_string(),
+ ))
+ }
+ None => {
+ return Err(ArrowError::ParseError(
+ "Field missing 'children'
attribute".to_string(),
+ ));
+ }
+ },
DataType::Struct(mut fields) => match map.get("children") {
Some(Value::Array(values)) => {
let struct_fields: Result<Vec<Field>> =
@@ -1191,6 +1209,10 @@ impl Field {
let item = Field::new("item", *dtype.clone(), self.nullable);
vec![item.to_json()]
}
+ DataType::LargeList(dtype) => {
+ let item = Field::new("item", *dtype.clone(), self.nullable);
+ vec![item.to_json()]
+ }
DataType::FixedSizeList(dtype, _) => {
let item = Field::new("item", *dtype.clone(), self.nullable);
vec![item.to_json()]
@@ -1307,12 +1329,15 @@ impl Field {
| DataType::Time64(_)
| DataType::Duration(_)
| DataType::Binary
+ | DataType::LargeBinary
| DataType::Interval(_)
+ | DataType::LargeList(_)
| DataType::List(_)
| DataType::Dictionary(_, _)
| DataType::FixedSizeList(_, _)
| DataType::FixedSizeBinary(_)
- | DataType::Utf8 => {
+ | DataType::Utf8
+ | DataType::LargeUtf8 => {
if self.data_type != from.data_type {
return Err(ArrowError::SchemaError(
"Fail to merge schema Field due to conflicting
datatype"
@@ -1846,6 +1871,15 @@ mod tests {
123,
true,
),
+ Field::new("c32", DataType::LargeBinary, true),
+ Field::new("c33", DataType::LargeUtf8, true),
+ Field::new(
+ "c34",
+ DataType::LargeList(Box::new(DataType::LargeList(Box::new(
+ DataType::Struct(vec![]),
+ )))),
+ true,
+ ),
],
metadata,
);
@@ -2193,11 +2227,53 @@ mod tests {
"id": 123,
"indexType": {
"name": "int",
- "isSigned": true,
- "bitWidth": 32
+ "bitWidth": 32,
+ "isSigned": true
},
"isOrdered": true
}
+ },
+ {
+ "name": "c32",
+ "nullable": true,
+ "type": {
+ "name": "largebinary"
+ },
+ "children": []
+ },
+ {
+ "name": "c33",
+ "nullable": true,
+ "type": {
+ "name": "largeutf8"
+ },
+ "children": []
+ },
+ {
+ "name": "c34",
+ "nullable": true,
+ "type": {
+ "name": "largelist"
+ },
+ "children": [
+ {
+ "name": "item",
+ "nullable": true,
+ "type": {
+ "name": "largelist"
+ },
+ "children": [
+ {
+ "name": "item",
+ "nullable": true,
+ "type": {
+ "name": "struct"
+ },
+ "children": []
+ }
+ ]
+ }
+ ]
}
],
"metadata" : {
@@ -2301,10 +2377,12 @@ mod tests {
let schema1 = Schema::new(vec![
Field::new("c1", DataType::Utf8, false),
Field::new("c2", DataType::Float64, true),
+ Field::new("c3", DataType::LargeBinary, true),
]);
let schema2 = Schema::new(vec![
Field::new("c1", DataType::Utf8, false),
Field::new("c2", DataType::Float64, true),
+ Field::new("c3", DataType::LargeBinary, true),
]);
assert_eq!(schema1, schema2);
diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs
index 84d6b39..d3c0732 100644
--- a/rust/arrow/src/ipc/convert.rs
+++ b/rust/arrow/src/ipc/convert.rs
@@ -211,7 +211,9 @@ pub(crate) fn get_data_type(field: ipc::Field,
may_be_dictionary: bool) -> DataT
}
}
ipc::Type::Binary => DataType::Binary,
+ ipc::Type::LargeBinary => DataType::LargeBinary,
ipc::Type::Utf8 => DataType::Utf8,
+ ipc::Type::LargeUtf8 => DataType::LargeUtf8,
ipc::Type::FixedSizeBinary => {
let fsb = field.type_as_fixed_size_binary().unwrap();
DataType::FixedSizeBinary(fsb.byteWidth())
@@ -292,6 +294,15 @@ pub(crate) fn get_data_type(field: ipc::Field,
may_be_dictionary: bool) -> DataT
// returning int16 for now, to test, not sure how to get data type
DataType::List(Box::new(get_data_type(child_field, false)))
}
+ ipc::Type::LargeList => {
+ let children = field.children().unwrap();
+ if children.len() != 1 {
+ panic!("expect a large list to have one child")
+ }
+ let child_field = children.get(0);
+ // the element type is resolved recursively from the single child field
+ DataType::LargeList(Box::new(get_data_type(child_field, false)))
+ }
ipc::Type::FixedSizeList => {
let children = field.children().unwrap();
if children.len() != 1 {
@@ -401,6 +412,14 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>(
Some(children),
)
}
+ LargeBinary => {
+ let children = fbb.create_vector(&empty_fields[..]);
+ (
+ ipc::Type::LargeBinary,
+ ipc::LargeBinaryBuilder::new(fbb).finish().as_union_value(),
+ Some(children),
+ )
+ }
Utf8 => {
let children = fbb.create_vector(&empty_fields[..]);
(
@@ -409,6 +428,14 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>(
Some(children),
)
}
+ LargeUtf8 => {
+ let children = fbb.create_vector(&empty_fields[..]);
+ (
+ ipc::Type::LargeUtf8,
+ ipc::LargeUtf8Builder::new(fbb).finish().as_union_value(),
+ Some(children),
+ )
+ }
FixedSizeBinary(len) => {
let children = fbb.create_vector(&empty_fields[..]);
let mut builder = ipc::FixedSizeBinaryBuilder::new(fbb);
@@ -538,6 +565,27 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>(
Some(children),
)
}
+ LargeList(ref list_type) => {
+ let inner_types = get_fb_field_type(list_type, fbb);
+ let child = ipc::Field::create(
+ fbb,
+ &ipc::FieldArgs {
+ name: None,
+ nullable: false,
+ type_type: inner_types.0,
+ type_: Some(inner_types.1),
+ dictionary: None,
+ children: inner_types.2,
+ custom_metadata: None,
+ },
+ );
+ let children = fbb.create_vector(&[child]);
+ (
+ ipc::Type::LargeList,
+ ipc::LargeListBuilder::new(fbb).finish().as_union_value(),
+ Some(children),
+ )
+ }
FixedSizeList(ref list_type, len) => {
let inner_types = get_fb_field_type(list_type, fbb);
let child = ipc::Field::create(
diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs
index cb7fb4c..c4b226a 100644
--- a/rust/arrow/src/ipc/reader.rs
+++ b/rust/arrow/src/ipc/reader.rs
@@ -63,7 +63,7 @@ fn create_array(
) -> (ArrayRef, usize, usize) {
use DataType::*;
let array = match data_type {
- Utf8 | Binary => {
+ Utf8 | Binary | LargeBinary | LargeUtf8 => {
let array = create_primitive_array(
&nodes[node_index],
data_type,
@@ -89,7 +89,7 @@ fn create_array(
buffer_index += 2;
array
}
- List(ref list_data_type) => {
+ List(ref list_data_type) | LargeList(ref list_data_type) => {
let list_node = &nodes[node_index];
let list_buffers: Vec<Buffer> = buffers[buffer_index..buffer_index
+ 2]
.iter()
@@ -225,7 +225,7 @@ fn create_primitive_array(
let length = field_node.length() as usize;
let null_count = field_node.null_count() as usize;
let array_data = match data_type {
- Utf8 | Binary => {
+ Utf8 | Binary | LargeBinary | LargeUtf8 => {
// read 3 buffers
let mut builder = ArrayData::builder(data_type.clone())
.len(length)
diff --git a/rust/integration-testing/src/bin/arrow-json-integration-test.rs
b/rust/integration-testing/src/bin/arrow-json-integration-test.rs
index a83e7ed..170892f 100644
--- a/rust/integration-testing/src/bin/arrow-json-integration-test.rs
+++ b/rust/integration-testing/src/bin/arrow-json-integration-test.rs
@@ -20,11 +20,7 @@ use serde_json::Value;
use arrow::util::integration_util::{ArrowJson, ArrowJsonBatch,
ArrowJsonSchema};
-use arrow::array::{
- ArrayRef, BinaryBuilder, BooleanBuilder, FixedSizeBinaryBuilder,
Float32Builder,
- Float64Builder, Int16Builder, Int32Builder, Int64Builder, Int8Builder,
NullArray,
- StringBuilder, UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder,
-};
+use arrow::array::*;
use arrow::datatypes::{DataType, DateUnit, IntervalUnit, Schema};
use arrow::error::{ArrowError, Result};
use arrow::ipc::reader::FileReader;
@@ -334,6 +330,26 @@ fn record_batch_from_json(
}
Arc::new(b.finish())
}
+ DataType::LargeBinary => {
+ let mut b = LargeBinaryBuilder::new(json_col.count);
+ for (is_valid, value) in json_col
+ .validity
+ .as_ref()
+ .unwrap()
+ .iter()
+ .zip(json_col.data.unwrap())
+ {
+ match is_valid {
+ 1 => {
+ let v = decode(value.as_str().unwrap()).unwrap();
+ b.append_value(&v)
+ }
+ _ => b.append_null(),
+ }
+ .unwrap();
+ }
+ Arc::new(b.finish())
+ }
DataType::Utf8 => {
let mut b = StringBuilder::new(json_col.count);
for (is_valid, value) in json_col
@@ -351,6 +367,23 @@ fn record_batch_from_json(
}
Arc::new(b.finish())
}
+ DataType::LargeUtf8 => {
+ let mut b = LargeStringBuilder::new(json_col.count);
+ for (is_valid, value) in json_col
+ .validity
+ .as_ref()
+ .unwrap()
+ .iter()
+ .zip(json_col.data.unwrap())
+ {
+ match is_valid {
+ 1 => b.append_value(value.as_str().unwrap()),
+ _ => b.append_null(),
+ }
+ .unwrap();
+ }
+ Arc::new(b.finish())
+ }
DataType::FixedSizeBinary(len) => {
let mut b = FixedSizeBinaryBuilder::new(json_col.count, *len);
for (is_valid, value) in json_col
diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs
index 4f56bbb..c31f9db 100644
--- a/rust/parquet/src/arrow/schema.rs
+++ b/rust/parquet/src/arrow/schema.rs
@@ -269,6 +269,9 @@ fn arrow_to_parquet_type(field: &Field) -> Result<Type> {
let dict_field = Field::new(name, *value.clone(),
field.is_nullable());
arrow_to_parquet_type(&dict_field)
}
+ DataType::LargeUtf8 | DataType::LargeBinary | DataType::LargeList(_)
=> {
+ Err(ArrowError("Large arrays not supported".to_string()))
+ }
}
}
/// This struct is used to group methods and data structures used to convert
parquet