This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9d60852  ARROW-8881: [Rust] Add large binary, string and list support
9d60852 is described below

commit 9d60852e7987f6f3dde4273c0343d326e6cc8dfa
Author: Neville Dipale <[email protected]>
AuthorDate: Sun Jul 5 18:43:06 2020 +0200

    ARROW-8881: [Rust] Add large binary, string and list support
    
    Similar to other implementations, this creates binary, string and list 
arrays with `i64` offsets instead of `i32`. Behaviourally, everything's the 
same as the `i32` counterparts, except for the larger array offsets.
    
    I'll look at the relevant integration tests separately.
    
    Code's a bit repetitive, but given that we currently don't have much review 
bandwidth, I'd rather not spend a lot of time on it now, as I want to try to 
complete other integration-related tasks in time for 1.0.0.
    
    Closes #7613 from nevi-me/rust-large-lists
    
    Authored-by: Neville Dipale <[email protected]>
    Signed-off-by: Neville Dipale <[email protected]>
---
 rust/arrow/src/array/array.rs                      | 756 ++++++++++++++++++---
 rust/arrow/src/array/builder.rs                    | 533 +++++++++++++++
 rust/arrow/src/array/equal.rs                      | 481 +++++++++++++
 rust/arrow/src/array/mod.rs                        |   7 +
 rust/arrow/src/datatypes.rs                        | 124 +++-
 rust/arrow/src/ipc/convert.rs                      |  48 ++
 rust/arrow/src/ipc/reader.rs                       |   6 +-
 .../src/bin/arrow-json-integration-test.rs         |  43 +-
 rust/parquet/src/arrow/schema.rs                   |   3 +
 9 files changed, 1866 insertions(+), 135 deletions(-)

diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs
index 18e11ce..c9eec1e 100644
--- a/rust/arrow/src/array/array.rs
+++ b/rust/arrow/src/array/array.rs
@@ -266,11 +266,14 @@ pub fn make_array(data: ArrayDataRef) -> ArrayRef {
             Arc::new(DurationNanosecondArray::from(data)) as ArrayRef
         }
         DataType::Binary => Arc::new(BinaryArray::from(data)) as ArrayRef,
+        DataType::LargeBinary => Arc::new(LargeBinaryArray::from(data)) as 
ArrayRef,
         DataType::FixedSizeBinary(_) => {
             Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef
         }
         DataType::Utf8 => Arc::new(StringArray::from(data)) as ArrayRef,
+        DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as 
ArrayRef,
         DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef,
+        DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as 
ArrayRef,
         DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef,
         DataType::Union(_) => Arc::new(UnionArray::from(data)) as ArrayRef,
         DataType::FixedSizeList(_, _) => {
@@ -951,6 +954,30 @@ impl ListArrayOps for FixedSizeBinaryArray {
     }
 }
 
+/// Common operations for large List types, currently `LargeListArray`, 
`LargeBinaryArray`
+///  and `LargeStringArray`
+pub trait LargeListArrayOps {
+    fn value_offset_at(&self, i: usize) -> i64;
+}
+
+impl LargeListArrayOps for LargeBinaryArray {
+    fn value_offset_at(&self, i: usize) -> i64 {
+        self.value_offset_at(i)
+    }
+}
+
+impl LargeListArrayOps for LargeStringArray {
+    fn value_offset_at(&self, i: usize) -> i64 {
+        self.value_offset_at(i)
+    }
+}
+
+impl LargeListArrayOps for LargeListArray {
+    fn value_offset_at(&self, i: usize) -> i64 {
+        self.value_offset_at(i)
+    }
+}
+
 /// A list array where each element is a variable-sized sequence of values 
with the same
 /// type.
 pub struct ListArray {
@@ -959,6 +986,14 @@ pub struct ListArray {
     value_offsets: RawPtrBox<i32>,
 }
 
+/// A list array where each element is a variable-sized sequence of values 
with the same
+/// type.
+pub struct LargeListArray {
+    data: ArrayDataRef,
+    values: ArrayRef,
+    value_offsets: RawPtrBox<i64>,
+}
+
 impl ListArray {
     /// Returns a reference to the values of this list.
     pub fn values(&self) -> ArrayRef {
@@ -999,6 +1034,46 @@ impl ListArray {
     }
 }
 
+impl LargeListArray {
+    /// Returns a reference to the values of this list.
+    pub fn values(&self) -> ArrayRef {
+        self.values.clone()
+    }
+
+    /// Returns a clone of the value type of this list.
+    pub fn value_type(&self) -> DataType {
+        self.values.data().data_type().clone()
+    }
+
+    /// Returns ith value of this list array.
+    pub fn value(&self, i: usize) -> ArrayRef {
+        self.values
+            .slice(self.value_offset(i) as usize, self.value_length(i) as 
usize)
+    }
+
+    /// Returns the offset for value at index `i`.
+    ///
+    /// Note this doesn't do any bound checking, for performance reason.
+    #[inline]
+    pub fn value_offset(&self, i: usize) -> i64 {
+        self.value_offset_at(self.data.offset() + i)
+    }
+
+    /// Returns the length for value at index `i`.
+    ///
+    /// Note this doesn't do any bound checking, for performance reason.
+    #[inline]
+    pub fn value_length(&self, mut i: usize) -> i64 {
+        i += self.data.offset();
+        self.value_offset_at(i + 1) - self.value_offset_at(i)
+    }
+
+    #[inline]
+    fn value_offset_at(&self, i: usize) -> i64 {
+        unsafe { *self.value_offsets.get().add(i) }
+    }
+}
+
 /// Constructs a `ListArray` from an array data reference.
 impl From<ArrayDataRef> for ListArray {
     fn from(data: ArrayDataRef) -> Self {
@@ -1030,6 +1105,37 @@ impl From<ArrayDataRef> for ListArray {
     }
 }
 
+/// Constructs a `LargeListArray` from an array data reference.
+impl From<ArrayDataRef> for LargeListArray {
+    fn from(data: ArrayDataRef) -> Self {
+        assert_eq!(
+            data.buffers().len(),
+            1,
+            "LargeListArray data should contain a single buffer only (value 
offsets)"
+        );
+        assert_eq!(
+            data.child_data().len(),
+            1,
+            "LargeListArray should contain a single child array (values array)"
+        );
+        let values = make_array(data.child_data()[0].clone());
+        let raw_value_offsets = data.buffers()[0].raw_data();
+        assert!(
+            memory::is_aligned(raw_value_offsets, mem::align_of::<i64>()),
+            "memory is not aligned"
+        );
+        let value_offsets = raw_value_offsets as *const i64;
+        unsafe {
+            assert_eq!(*value_offsets.offset(0), 0, "offsets do not start at 
zero");
+        }
+        Self {
+            data,
+            values,
+            value_offsets: RawPtrBox::new(value_offsets),
+        }
+    }
+}
+
 impl Array for ListArray {
     fn as_any(&self) -> &Any {
         self
@@ -1044,6 +1150,20 @@ impl Array for ListArray {
     }
 }
 
+impl Array for LargeListArray {
+    fn as_any(&self) -> &Any {
+        self
+    }
+
+    fn data(&self) -> ArrayDataRef {
+        self.data.clone()
+    }
+
+    fn data_ref(&self) -> &ArrayDataRef {
+        &self.data
+    }
+}
+
 // Helper function for printing potentially long arrays.
 fn print_long_array<A, F>(array: &A, f: &mut fmt::Formatter, print_item: F) -> 
fmt::Result
 where
@@ -1086,6 +1206,16 @@ impl fmt::Debug for ListArray {
     }
 }
 
+impl fmt::Debug for LargeListArray {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "LargeListArray\n[\n")?;
+        print_long_array(self, f, |array, index, f| {
+            fmt::Debug::fmt(&array.value(index), f)
+        })?;
+        write!(f, "]")
+    }
+}
+
 /// A list array where each element is a fixed-size sequence of values with 
the same
 /// type.
 pub struct FixedSizeListArray {
@@ -1194,20 +1324,69 @@ impl fmt::Debug for FixedSizeListArray {
     }
 }
 
-/// A type of `ListArray` whose elements are binaries.
-pub struct BinaryArray {
-    data: ArrayDataRef,
-    value_offsets: RawPtrBox<i32>,
-    value_data: RawPtrBox<u8>,
-}
+macro_rules! make_binary_type {
+    ($name:ident, $offset_ty:ty) => {
+        pub struct $name {
+            data: ArrayDataRef,
+            value_offsets: RawPtrBox<$offset_ty>,
+            value_data: RawPtrBox<u8>,
+        }
 
-/// A type of `ListArray` whose elements are UTF8 strings.
-pub struct StringArray {
-    data: ArrayDataRef,
-    value_offsets: RawPtrBox<i32>,
-    value_data: RawPtrBox<u8>,
+        impl $name {
+            /// Returns the offset for the element at index `i`.
+            ///
+            /// Note this doesn't do any bound checking, for performance 
reason.
+            #[inline]
+            pub fn value_offset(&self, i: usize) -> $offset_ty {
+                self.value_offset_at(self.data.offset() + i)
+            }
+
+            /// Returns the length for the element at index `i`.
+            ///
+            /// Note this doesn't do any bound checking, for performance 
reason.
+            #[inline]
+            pub fn value_length(&self, mut i: usize) -> $offset_ty {
+                i += self.data.offset();
+                self.value_offset_at(i + 1) - self.value_offset_at(i)
+            }
+
+            /// Returns a clone of the value offset buffer
+            pub fn value_offsets(&self) -> Buffer {
+                self.data.buffers()[0].clone()
+            }
+
+            /// Returns a clone of the value data buffer
+            pub fn value_data(&self) -> Buffer {
+                self.data.buffers()[1].clone()
+            }
+
+            #[inline]
+            fn value_offset_at(&self, i: usize) -> $offset_ty {
+                unsafe { *self.value_offsets.get().add(i) }
+            }
+        }
+
+        impl Array for $name {
+            fn as_any(&self) -> &Any {
+                self
+            }
+
+            fn data(&self) -> ArrayDataRef {
+                self.data.clone()
+            }
+
+            fn data_ref(&self) -> &ArrayDataRef {
+                &self.data
+            }
+        }
+    };
 }
 
+make_binary_type!(BinaryArray, i32);
+make_binary_type!(LargeBinaryArray, i64);
+make_binary_type!(StringArray, i32);
+make_binary_type!(LargeStringArray, i64);
+
 /// A type of `FixedSizeListArray` whose elements are binaries.
 pub struct FixedSizeBinaryArray {
     data: ArrayDataRef,
@@ -1229,41 +1408,29 @@ impl BinaryArray {
         }
     }
 
-    /// Returns the offset for the element at index `i`.
-    ///
-    /// Note this doesn't do any bound checking, for performance reason.
-    #[inline]
-    pub fn value_offset(&self, i: usize) -> i32 {
-        self.value_offset_at(self.data.offset() + i)
-    }
-
-    /// Returns the length for the element at index `i`.
-    ///
-    /// Note this doesn't do any bound checking, for performance reason.
-    #[inline]
-    pub fn value_length(&self, mut i: usize) -> i32 {
-        i += self.data.offset();
-        self.value_offset_at(i + 1) - self.value_offset_at(i)
-    }
-
-    /// Returns a clone of the value offset buffer
-    pub fn value_offsets(&self) -> Buffer {
-        self.data.buffers()[0].clone()
-    }
-
-    /// Returns a clone of the value data buffer
-    pub fn value_data(&self) -> Buffer {
-        self.data.buffers()[1].clone()
+    /// Returns a new binary array builder
+    pub fn builder(capacity: usize) -> BinaryBuilder {
+        BinaryBuilder::new(capacity)
     }
+}
 
-    #[inline]
-    fn value_offset_at(&self, i: usize) -> i32 {
-        unsafe { *self.value_offsets.get().add(i) }
+impl LargeBinaryArray {
+    /// Returns the element at index `i` as a byte slice.
+    pub fn value(&self, i: usize) -> &[u8] {
+        assert!(i < self.data.len(), "LargeBinaryArray out of bounds access");
+        let offset = i.checked_add(self.data.offset()).unwrap();
+        unsafe {
+            let pos = self.value_offset_at(offset);
+            std::slice::from_raw_parts(
+                self.value_data.get().offset(pos as isize),
+                (self.value_offset_at(offset + 1) - pos) as usize,
+            )
+        }
     }
 
-    // Returns a new binary array builder
-    pub fn builder(capacity: usize) -> BinaryBuilder {
-        BinaryBuilder::new(capacity)
+    /// Returns a new large binary array builder
+    pub fn builder(capacity: usize) -> LargeBinaryBuilder {
+        LargeBinaryBuilder::new(capacity)
     }
 }
 
@@ -1283,41 +1450,31 @@ impl StringArray {
         }
     }
 
-    /// Returns the offset for the element at index `i`.
-    ///
-    /// Note this doesn't do any bound checking, for performance reason.
-    #[inline]
-    pub fn value_offset(&self, i: usize) -> i32 {
-        self.value_offset_at(self.data.offset() + i)
-    }
-
-    /// Returns the length for the element at index `i`.
-    ///
-    /// Note this doesn't do any bound checking, for performance reason.
-    #[inline]
-    pub fn value_length(&self, mut i: usize) -> i32 {
-        i += self.data.offset();
-        self.value_offset_at(i + 1) - self.value_offset_at(i)
-    }
-
-    /// Returns a clone of the value offset buffer
-    pub fn value_offsets(&self) -> Buffer {
-        self.data.buffers()[0].clone()
+    /// Returns a new string array builder
+    pub fn builder(capacity: usize) -> StringBuilder {
+        StringBuilder::new(capacity)
     }
+}
 
-    /// Returns a clone of the value data buffer
-    pub fn value_data(&self) -> Buffer {
-        self.data.buffers()[1].clone()
-    }
+impl LargeStringArray {
+    /// Returns the element at index `i` as a string slice.
+    pub fn value(&self, i: usize) -> &str {
+        assert!(i < self.data.len(), "LargeStringArray out of bounds access");
+        let offset = i.checked_add(self.data.offset()).unwrap();
+        unsafe {
+            let pos = self.value_offset_at(offset);
+            let slice = std::slice::from_raw_parts(
+                self.value_data.get().offset(pos as isize),
+                (self.value_offset_at(offset + 1) - pos) as usize,
+            );
 
-    #[inline]
-    fn value_offset_at(&self, i: usize) -> i32 {
-        unsafe { *self.value_offsets.get().add(i) }
+            std::str::from_utf8_unchecked(slice)
+        }
     }
 
-    // Returns a new string array builder
-    pub fn builder(capacity: usize) -> StringBuilder {
-        StringBuilder::new(capacity)
+    // Returns a new large string array builder
+    pub fn builder(capacity: usize) -> LargeStringBuilder {
+        LargeStringBuilder::new(capacity)
     }
 }
 
@@ -1386,6 +1543,27 @@ impl From<ArrayDataRef> for BinaryArray {
     }
 }
 
+impl From<ArrayDataRef> for LargeBinaryArray {
+    fn from(data: ArrayDataRef) -> Self {
+        assert_eq!(
+            data.buffers().len(),
+            2,
+            "LargeBinaryArray data should contain 2 buffers only (offsets and 
values)"
+        );
+        let raw_value_offsets = data.buffers()[0].raw_data();
+        assert!(
+            memory::is_aligned(raw_value_offsets, mem::align_of::<i64>()),
+            "memory is not aligned"
+        );
+        let value_data = data.buffers()[1].raw_data();
+        Self {
+            data,
+            value_offsets: RawPtrBox::new(raw_value_offsets as *const i64),
+            value_data: RawPtrBox::new(value_data),
+        }
+    }
+}
+
 impl From<ArrayDataRef> for StringArray {
     fn from(data: ArrayDataRef) -> Self {
         assert_eq!(
@@ -1407,6 +1585,27 @@ impl From<ArrayDataRef> for StringArray {
     }
 }
 
+impl From<ArrayDataRef> for LargeStringArray {
+    fn from(data: ArrayDataRef) -> Self {
+        assert_eq!(
+            data.buffers().len(),
+            2,
+            "LargeStringArray data should contain 2 buffers only (offsets and 
values)"
+        );
+        let raw_value_offsets = data.buffers()[0].raw_data();
+        assert!(
+            memory::is_aligned(raw_value_offsets, mem::align_of::<i64>()),
+            "memory is not aligned"
+        );
+        let value_data = data.buffers()[1].raw_data();
+        Self {
+            data,
+            value_offsets: RawPtrBox::new(raw_value_offsets as *const i64),
+            value_data: RawPtrBox::new(value_data),
+        }
+    }
+}
+
 impl From<ArrayDataRef> for FixedSizeBinaryArray {
     fn from(data: ArrayDataRef) -> Self {
         assert_eq!(
@@ -1447,6 +1646,26 @@ impl<'a> From<Vec<&'a str>> for StringArray {
     }
 }
 
+impl<'a> From<Vec<&'a str>> for LargeStringArray {
+    fn from(v: Vec<&'a str>) -> Self {
+        let mut offsets = Vec::with_capacity(v.len() + 1);
+        let mut values = Vec::new();
+        let mut length_so_far = 0;
+        offsets.push(length_so_far);
+        for s in &v {
+            length_so_far += s.len() as i64;
+            offsets.push(length_so_far as i64);
+            values.extend_from_slice(s.as_bytes());
+        }
+        let array_data = ArrayData::builder(DataType::LargeUtf8)
+            .len(v.len())
+            .add_buffer(Buffer::from(offsets.to_byte_slice()))
+            .add_buffer(Buffer::from(&values[..]))
+            .build();
+        LargeStringArray::from(array_data)
+    }
+}
+
 impl From<Vec<&[u8]>> for BinaryArray {
     fn from(v: Vec<&[u8]>) -> Self {
         let mut offsets = Vec::with_capacity(v.len() + 1);
@@ -1467,6 +1686,26 @@ impl From<Vec<&[u8]>> for BinaryArray {
     }
 }
 
+impl From<Vec<&[u8]>> for LargeBinaryArray {
+    fn from(v: Vec<&[u8]>) -> Self {
+        let mut offsets = Vec::with_capacity(v.len() + 1);
+        let mut values = Vec::new();
+        let mut length_so_far = 0;
+        offsets.push(length_so_far);
+        for s in &v {
+            length_so_far += s.len() as i64;
+            offsets.push(length_so_far as i64);
+            values.extend_from_slice(s);
+        }
+        let array_data = ArrayData::builder(DataType::LargeBinary)
+            .len(v.len())
+            .add_buffer(Buffer::from(offsets.to_byte_slice()))
+            .add_buffer(Buffer::from(&values[..]))
+            .build();
+        LargeBinaryArray::from(array_data)
+    }
+}
+
 impl<'a> TryFrom<Vec<Option<&'a str>>> for StringArray {
     type Error = ArrowError;
 
@@ -1483,6 +1722,22 @@ impl<'a> TryFrom<Vec<Option<&'a str>>> for StringArray {
     }
 }
 
+impl<'a> TryFrom<Vec<Option<&'a str>>> for LargeStringArray {
+    type Error = ArrowError;
+
+    fn try_from(v: Vec<Option<&'a str>>) -> Result<Self> {
+        let mut builder = LargeStringBuilder::new(v.len());
+        for val in v {
+            if let Some(s) = val {
+                builder.append_value(s)?;
+            } else {
+                builder.append(false)?;
+            }
+        }
+        Ok(builder.finish())
+    }
+}
+
 /// Creates a `BinaryArray` from `List<u8>` array
 impl From<ListArray> for BinaryArray {
     fn from(v: ListArray) -> Self {
@@ -1543,6 +1798,66 @@ impl From<ListArray> for StringArray {
     }
 }
 
+/// Creates a `LargeBinaryArray` from `LargeList<u8>` array
+impl From<LargeListArray> for LargeBinaryArray {
+    fn from(v: LargeListArray) -> Self {
+        assert_eq!(
+            v.data().child_data()[0].child_data().len(),
+            0,
+            "LargeBinaryArray can only be created from list array of u8 values 
\
+             (i.e. LargeList<PrimitiveArray<u8>>)."
+        );
+        assert_eq!(
+            v.data().child_data()[0].data_type(),
+            &DataType::UInt8,
+            "LargeBinaryArray can only be created from LargeList<u8> arrays, 
mismatched data types."
+        );
+
+        let mut builder = ArrayData::builder(DataType::LargeBinary)
+            .len(v.len())
+            .add_buffer(v.data().buffers()[0].clone())
+            .add_buffer(v.data().child_data()[0].buffers()[0].clone());
+        if let Some(bitmap) = v.data().null_bitmap() {
+            builder = builder
+                .null_count(v.data().null_count())
+                .null_bit_buffer(bitmap.bits.clone())
+        }
+
+        let data = builder.build();
+        Self::from(data)
+    }
+}
+
+/// Creates a `LargeStringArray` from `LargeList<u8>` array
+impl From<LargeListArray> for LargeStringArray {
+    fn from(v: LargeListArray) -> Self {
+        assert_eq!(
+            v.data().child_data()[0].child_data().len(),
+            0,
+            "LargeStringArray can only be created from list array of u8 values 
\
+             (i.e. LargeList<PrimitiveArray<u8>>)."
+        );
+        assert_eq!(
+            v.data().child_data()[0].data_type(),
+            &DataType::UInt8,
+            "LargeStringArray can only be created from LargeList<u8> arrays, 
mismatched data types."
+        );
+
+        let mut builder = ArrayData::builder(DataType::LargeUtf8)
+            .len(v.len())
+            .add_buffer(v.data().buffers()[0].clone())
+            .add_buffer(v.data().child_data()[0].buffers()[0].clone());
+        if let Some(bitmap) = v.data().null_bitmap() {
+            builder = builder
+                .null_count(v.data().null_count())
+                .null_bit_buffer(bitmap.bits.clone())
+        }
+
+        let data = builder.build();
+        Self::from(data)
+    }
+}
+
 /// Creates a `FixedSizeBinaryArray` from `FixedSizeList<u8>` array
 impl From<FixedSizeListArray> for FixedSizeBinaryArray {
     fn from(v: FixedSizeListArray) -> Self {
@@ -1572,9 +1887,19 @@ impl From<FixedSizeListArray> for FixedSizeBinaryArray {
     }
 }
 
-impl fmt::Debug for BinaryArray {
+impl fmt::Debug for BinaryArray {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "BinaryArray\n[\n")?;
+        print_long_array(self, f, |array, index, f| {
+            fmt::Debug::fmt(&array.value(index), f)
+        })?;
+        write!(f, "]")
+    }
+}
+
+impl fmt::Debug for LargeBinaryArray {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "BinaryArray\n[\n")?;
+        write!(f, "LargeBinaryArray\n[\n")?;
         print_long_array(self, f, |array, index, f| {
             fmt::Debug::fmt(&array.value(index), f)
         })?;
@@ -1592,9 +1917,9 @@ impl fmt::Debug for StringArray {
     }
 }
 
-impl fmt::Debug for FixedSizeBinaryArray {
+impl fmt::Debug for LargeStringArray {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "FixedSizeBinaryArray<{}>\n[\n", self.value_length())?;
+        write!(f, "LargeStringArray\n[\n")?;
         print_long_array(self, f, |array, index, f| {
             fmt::Debug::fmt(&array.value(index), f)
         })?;
@@ -1602,31 +1927,13 @@ impl fmt::Debug for FixedSizeBinaryArray {
     }
 }
 
-impl Array for BinaryArray {
-    fn as_any(&self) -> &Any {
-        self
-    }
-
-    fn data(&self) -> ArrayDataRef {
-        self.data.clone()
-    }
-
-    fn data_ref(&self) -> &ArrayDataRef {
-        &self.data
-    }
-}
-
-impl Array for StringArray {
-    fn as_any(&self) -> &Any {
-        self
-    }
-
-    fn data(&self) -> ArrayDataRef {
-        self.data.clone()
-    }
-
-    fn data_ref(&self) -> &ArrayDataRef {
-        &self.data
+impl fmt::Debug for FixedSizeBinaryArray {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "FixedSizeBinaryArray<{}>\n[\n", self.value_length())?;
+        print_long_array(self, f, |array, index, f| {
+            fmt::Debug::fmt(&array.value(index), f)
+        })?;
+        write!(f, "]")
     }
 }
 
@@ -2537,6 +2844,75 @@ mod tests {
     }
 
     #[test]
+    fn test_large_list_array() {
+        // Construct a value array
+        let value_data = ArrayData::builder(DataType::Int32)
+            .len(8)
+            .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 
7].to_byte_slice()))
+            .build();
+
+        // Construct a buffer for value offsets, for the nested array:
+        //  [[0, 1, 2], [3, 4, 5], [6, 7]]
+        let value_offsets = Buffer::from(&[0i64, 3, 6, 8].to_byte_slice());
+
+        // Construct a list array from the above two
+        let list_data_type = DataType::LargeList(Box::new(DataType::Int32));
+        let list_data = ArrayData::builder(list_data_type.clone())
+            .len(3)
+            .add_buffer(value_offsets.clone())
+            .add_child_data(value_data.clone())
+            .build();
+        let list_array = LargeListArray::from(list_data);
+
+        let values = list_array.values();
+        assert_eq!(value_data, values.data());
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(3, list_array.len());
+        assert_eq!(0, list_array.null_count());
+        assert_eq!(6, list_array.value_offset(2));
+        assert_eq!(2, list_array.value_length(2));
+        assert_eq!(
+            0,
+            list_array
+                .value(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap()
+                .value(0)
+        );
+        for i in 0..3 {
+            assert!(list_array.is_valid(i));
+            assert!(!list_array.is_null(i));
+        }
+
+        // Now test with a non-zero offset
+        let list_data = ArrayData::builder(list_data_type)
+            .len(3)
+            .offset(1)
+            .add_buffer(value_offsets)
+            .add_child_data(value_data.clone())
+            .build();
+        let list_array = LargeListArray::from(list_data);
+
+        let values = list_array.values();
+        assert_eq!(value_data, values.data());
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(3, list_array.len());
+        assert_eq!(0, list_array.null_count());
+        assert_eq!(6, list_array.value_offset(1));
+        assert_eq!(2, list_array.value_length(1));
+        assert_eq!(
+            3,
+            list_array
+                .value(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap()
+                .value(0)
+        );
+    }
+
+    #[test]
     fn test_dictionary_array() {
         // Construct a value array
         let value_data = ArrayData::builder(DataType::Int8)
@@ -2747,6 +3123,72 @@ mod tests {
     }
 
     #[test]
+    fn test_large_list_array_slice() {
+        // Construct a value array
+        let value_data = ArrayData::builder(DataType::Int32)
+            .len(10)
+            .add_buffer(Buffer::from(
+                &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9].to_byte_slice(),
+            ))
+            .build();
+
+        // Construct a buffer for value offsets, for the nested array:
+        //  [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]]
+        let value_offsets =
+            Buffer::from(&[0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10].to_byte_slice());
+        // 01011001 00000001
+        let mut null_bits: [u8; 2] = [0; 2];
+        bit_util::set_bit(&mut null_bits, 0);
+        bit_util::set_bit(&mut null_bits, 3);
+        bit_util::set_bit(&mut null_bits, 4);
+        bit_util::set_bit(&mut null_bits, 6);
+        bit_util::set_bit(&mut null_bits, 8);
+
+        // Construct a list array from the above two
+        let list_data_type = DataType::LargeList(Box::new(DataType::Int32));
+        let list_data = ArrayData::builder(list_data_type.clone())
+            .len(9)
+            .add_buffer(value_offsets.clone())
+            .add_child_data(value_data.clone())
+            .null_bit_buffer(Buffer::from(null_bits))
+            .build();
+        let list_array = LargeListArray::from(list_data);
+
+        let values = list_array.values();
+        assert_eq!(value_data, values.data());
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(9, list_array.len());
+        assert_eq!(4, list_array.null_count());
+        assert_eq!(2, list_array.value_offset(3));
+        assert_eq!(2, list_array.value_length(3));
+
+        let sliced_array = list_array.slice(1, 6);
+        assert_eq!(6, sliced_array.len());
+        assert_eq!(1, sliced_array.offset());
+        assert_eq!(3, sliced_array.null_count());
+
+        for i in 0..sliced_array.len() {
+            if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+                assert!(sliced_array.is_valid(i));
+            } else {
+                assert!(sliced_array.is_null(i));
+            }
+        }
+
+        // Check offset and length for each non-null value.
+        let sliced_list_array = sliced_array
+            .as_any()
+            .downcast_ref::<LargeListArray>()
+            .unwrap();
+        assert_eq!(2, sliced_list_array.value_offset(2));
+        assert_eq!(2, sliced_list_array.value_length(2));
+        assert_eq!(4, sliced_list_array.value_offset(3));
+        assert_eq!(2, sliced_list_array.value_length(3));
+        assert_eq!(6, sliced_list_array.value_offset(5));
+        assert_eq!(3, sliced_list_array.value_length(5));
+    }
+
+    #[test]
     fn test_fixed_size_list_array_slice() {
         // Construct a value array
         let value_data = ArrayData::builder(DataType::Int32)
@@ -2902,6 +3344,53 @@ mod tests {
     }
 
     #[test]
+    fn test_large_binary_array() {
+        let values: [u8; 12] = [
+            b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', 
b't',
+        ];
+        let offsets: [i64; 4] = [0, 5, 5, 12];
+
+        // Array data: ["hello", "", "parquet"]
+        let array_data = ArrayData::builder(DataType::Binary)
+            .len(3)
+            .add_buffer(Buffer::from(offsets.to_byte_slice()))
+            .add_buffer(Buffer::from(&values[..]))
+            .build();
+        let binary_array = LargeBinaryArray::from(array_data);
+        assert_eq!(3, binary_array.len());
+        assert_eq!(0, binary_array.null_count());
+        assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0));
+        assert_eq!([] as [u8; 0], binary_array.value(1));
+        assert_eq!(
+            [b'p', b'a', b'r', b'q', b'u', b'e', b't'],
+            binary_array.value(2)
+        );
+        assert_eq!(5, binary_array.value_offset(2));
+        assert_eq!(7, binary_array.value_length(2));
+        for i in 0..3 {
+            assert!(binary_array.is_valid(i));
+            assert!(!binary_array.is_null(i));
+        }
+
+        // Test binary array with offset
+        let array_data = ArrayData::builder(DataType::LargeBinary)
+            .len(4)
+            .offset(1)
+            .add_buffer(Buffer::from(offsets.to_byte_slice()))
+            .add_buffer(Buffer::from(&values[..]))
+            .build();
+        let binary_array = LargeBinaryArray::from(array_data);
+        assert_eq!(
+            [b'p', b'a', b'r', b'q', b'u', b'e', b't'],
+            binary_array.value(1)
+        );
+        assert_eq!(5, binary_array.value_offset(0));
+        assert_eq!(0, binary_array.value_length(0));
+        assert_eq!(5, binary_array.value_offset(1));
+        assert_eq!(7, binary_array.value_length(1));
+    }
+
+    #[test]
     fn test_binary_array_from_list_array() {
         let values: [u8; 12] = [
             b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', 
b't',
@@ -2941,6 +3430,45 @@ mod tests {
     }
 
     #[test]
+    fn test_large_binary_array_from_list_array() {
+        let values: [u8; 12] = [
+            b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
+        ];
+        let values_data = ArrayData::builder(DataType::UInt8)
+            .len(12)
+            .add_buffer(Buffer::from(&values[..]))
+            .build();
+        let offsets: [i64; 4] = [0, 5, 5, 12];
+
+        // Array data: ["hello", "", "parquet"]
+        let array_data1 = ArrayData::builder(DataType::LargeBinary)
+            .len(3)
+            .add_buffer(Buffer::from(offsets.to_byte_slice()))
+            .add_buffer(Buffer::from(&values[..]))
+            .build();
+        let binary_array1 = LargeBinaryArray::from(array_data1);
+
+        // The same content in list layout: one i64 offsets buffer plus a UInt8 child
+        // array, so the data is typed as a large list of bytes rather than `Binary`.
+        let list_type = DataType::LargeList(Box::new(DataType::UInt8));
+        let array_data2 = ArrayData::builder(list_type)
+            .len(3)
+            .add_buffer(Buffer::from(offsets.to_byte_slice()))
+            .add_child_data(values_data)
+            .build();
+        let list_array = LargeListArray::from(array_data2);
+        let binary_array2 = LargeBinaryArray::from(list_array);
+
+        // The conversion must flatten the child array into a second buffer
+        assert_eq!(2, binary_array2.data().buffers().len());
+        assert_eq!(0, binary_array2.data().child_data().len());
+
+        assert_eq!(binary_array1.len(), binary_array2.len());
+        assert_eq!(binary_array1.null_count(), binary_array2.null_count());
+        for i in 0..binary_array1.len() {
+            assert_eq!(binary_array1.value(i), binary_array2.value(i));
+            assert_eq!(binary_array1.value_offset(i), binary_array2.value_offset(i));
+            assert_eq!(binary_array1.value_length(i), binary_array2.value_length(i));
+        }
+    }
+
+    #[test]
     fn test_string_array_from_u8_slice() {
         let values: Vec<&str> = vec!["hello", "", "parquet"];
 
@@ -2961,6 +3489,26 @@ mod tests {
     }
 
     #[test]
+    fn test_large_string_array_from_u8_slice() {
+        let values: Vec<&str> = vec!["hello", "", "parquet"];
+
+        // Array data: ["hello", "", "parquet"]
+        let string_array = LargeStringArray::from(values);
+
+        assert_eq!(3, string_array.len());
+        assert_eq!(0, string_array.null_count());
+        assert_eq!("hello", string_array.value(0));
+        assert_eq!("", string_array.value(1));
+        assert_eq!("parquet", string_array.value(2));
+        assert_eq!(5, string_array.value_offset(2));
+        assert_eq!(7, string_array.value_length(2));
+        for i in 0..3 {
+            assert!(string_array.is_valid(i));
+            assert!(!string_array.is_null(i));
+        }
+    }
+
+    #[test]
     fn test_nested_string_array() {
         let string_builder = StringBuilder::new(3);
         let mut list_of_string_builder = ListBuilder::new(string_builder);
@@ -3164,10 +3712,10 @@ mod tests {
     }
 
     #[test]
-    fn test_fixed_size_binary_array_fmt_debug() {
-        let arr: StringArray = vec!["hello", "arrow"].into();
+    fn test_large_string_array_fmt_debug() {
+        let arr: LargeStringArray = vec!["hello", "arrow"].into();
         assert_eq!(
-            "StringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
+            "LargeStringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
             format!("{:?}", arr)
         );
     }
diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs
index 6e8f196..691f7cd 100644
--- a/rust/arrow/src/array/builder.rs
+++ b/rust/arrow/src/array/builder.rs
@@ -824,6 +824,207 @@ where
 }
 
 ///  Array builder for `ListArray`
+pub struct LargeListBuilder<T: ArrayBuilder> {
+    offsets_builder: Int64BufferBuilder,
+    bitmap_builder: BooleanBufferBuilder,
+    values_builder: T,
+    len: usize,
+}
+
+impl<T: ArrayBuilder> LargeListBuilder<T> {
+    /// Creates a new `LargeListBuilder` from a given values array builder
+    ///
+    /// The item capacity is taken from the current length of `values_builder`.
+    pub fn new(values_builder: T) -> Self {
+        let capacity = values_builder.len();
+        Self::with_capacity(values_builder, capacity)
+    }
+
+    /// Creates a new `LargeListBuilder` from a given values array builder
+    /// `capacity` is the number of items to pre-allocate space for in this builder
+    pub fn with_capacity(values_builder: T, capacity: usize) -> Self {
+        // one more offset than items: the offsets buffer always starts with 0
+        let mut offsets_builder = Int64BufferBuilder::new(capacity + 1);
+        offsets_builder.append(0).unwrap();
+        Self {
+            offsets_builder,
+            bitmap_builder: BooleanBufferBuilder::new(capacity),
+            values_builder,
+            len: 0,
+        }
+    }
+}
+
+impl<T: ArrayBuilder> ArrayBuilder for LargeListBuilder<T>
+where
+    T: 'static,
+{
+    /// Returns the builder as a non-mutable `Any` reference.
+    fn as_any(&self) -> &Any {
+        self
+    }
+
+    /// Appends data from other arrays into the builder
+    ///
+    /// This is most useful when concatenating arrays of the same type into a 
builder.
+    fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> {
+        // validate arraydata and reserve memory
+        let mut total_len = 0;
+        for array in data {
+            if array.data_type() != &self.data_type() {
+                return Err(ArrowError::InvalidArgumentError(
+                    "Cannot append data to builder if data types are different"
+                        .to_string(),
+                ));
+            }
+            if array.buffers().len() != 1 {
+                return Err(ArrowError::InvalidArgumentError(
+                    "List arrays should have 1 buffer".to_string(),
+                ));
+            }
+            if array.child_data().len() != 1 {
+                return Err(ArrowError::InvalidArgumentError(
+                    "List arrays should have 1 child_data element".to_string(),
+                ));
+            }
+            total_len += array.len();
+        }
+        // reserve memory
+        self.offsets_builder.reserve(total_len)?;
+        self.bitmap_builder.reserve(total_len)?;
+        // values_builder is allocated by the relevant builder, and is not 
allocated here
+
+        // determine the latest offset on the builder
+        let mut cum_offset = if self.offsets_builder.len() == 0 {
+            0
+        } else {
+            // peek into buffer to get last appended offset
+            let buffer = self.offsets_builder.buffer.data();
+            let len = self.offsets_builder.len();
+            let (start, end) = ((len - 1) * 8, len * 8);
+            let slice = &buffer[start..end];
+            i64::from_le_bytes(slice.try_into().unwrap())
+        };
+        for array in data {
+            let len = array.len();
+            if len == 0 {
+                continue;
+            }
+            let offset = array.offset();
+
+            // `typed_data` is unsafe, however this call is safe as 
`LargeListArray` has i64 offsets
+            let offsets = unsafe {
+                &array.buffers()[0].typed_data::<i64>()[offset..(len + offset) 
+ 1]
+            };
+            // the offsets of the child array determine its length
+            // this could be obtained by getting the concrete ListArray and 
getting value_offsets
+            let offset_at_len = offsets[offsets.len() - 1] as usize;
+            let first_offset = offsets[0] as usize;
+            // create the child array and offset it
+            let child_data = &array.child_data()[0];
+            let child_array = make_array(child_data.clone());
+            // slice the child array to account for offsets
+            let sliced = child_array.slice(first_offset, offset_at_len - 
first_offset);
+            self.values().append_data(&[sliced.data()])?;
+            let adjusted_offsets: Vec<i64> = offsets
+                .windows(2)
+                .into_iter()
+                .map(|w| {
+                    let curr_offset = w[1] - w[0] + cum_offset;
+                    cum_offset = curr_offset;
+                    curr_offset
+                })
+                .collect();
+            self.offsets_builder
+                .append_slice(adjusted_offsets.as_slice())?;
+
+            for i in 0..len {
+                // account for offset as `ArrayData` does not
+                self.bitmap_builder.append(array.is_valid(offset + i))?;
+            }
+        }
+
+        // append array length
+        self.len += total_len;
+        Ok(())
+    }
+
+    /// Returns the data type of the builder
+    ///
+    /// This is used for validating array data types in `append_data`
+    fn data_type(&self) -> DataType {
+        DataType::LargeList(Box::new(self.values_builder.data_type()))
+    }
+
+    /// Returns the builder as a mutable `Any` reference.
+    fn as_any_mut(&mut self) -> &mut Any {
+        self
+    }
+
+    /// Returns the boxed builder as a box of `Any`.
+    fn into_box_any(self: Box<Self>) -> Box<Any> {
+        self
+    }
+
+    /// Returns the number of array slots in the builder
+    fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Builds the array and reset this builder.
+    fn finish(&mut self) -> ArrayRef {
+        Arc::new(self.finish())
+    }
+}
+
+impl<T: ArrayBuilder> LargeListBuilder<T>
+where
+    T: 'static,
+{
+    /// Returns the child array builder as a mutable reference.
+    ///
+    /// This mutable reference can be used to append values into the child 
array builder,
+    /// but you must call `append` to delimit each distinct list value.
+    pub fn values(&mut self) -> &mut T {
+        &mut self.values_builder
+    }
+
+    /// Finish the current variable-length list array slot
+    pub fn append(&mut self, is_valid: bool) -> Result<()> {
+        self.offsets_builder
+            .append(self.values_builder.len() as i64)?;
+        self.bitmap_builder.append(is_valid)?;
+        self.len += 1;
+        Ok(())
+    }
+
+    /// Builds the `LargeListArray` and reset this builder.
+    pub fn finish(&mut self) -> LargeListArray {
+        let len = self.len();
+        self.len = 0;
+        let values_arr = self
+            .values_builder
+            .as_any_mut()
+            .downcast_mut::<T>()
+            .unwrap()
+            .finish();
+        let values_data = values_arr.data();
+
+        let offset_buffer = self.offsets_builder.finish();
+        let null_bit_buffer = self.bitmap_builder.finish();
+        self.offsets_builder.append(0).unwrap();
+        let data = ArrayData::builder(DataType::LargeList(Box::new(
+            values_data.data_type().clone(),
+        )))
+        .len(len)
+        .null_count(len - bit_util::count_set_bits(null_bit_buffer.data()))
+        .add_buffer(offset_buffer)
+        .add_child_data(values_data)
+        .null_bit_buffer(null_bit_buffer)
+        .build();
+
+        LargeListArray::from(data)
+    }
+}
+
+///  Array builder for `FixedSizeListArray`
 pub struct FixedSizeListBuilder<T: ArrayBuilder> {
     bitmap_builder: BooleanBufferBuilder,
     values_builder: T,
@@ -1007,10 +1208,18 @@ pub struct BinaryBuilder {
     builder: ListBuilder<UInt8Builder>,
 }
 
+pub struct LargeBinaryBuilder {
+    builder: LargeListBuilder<UInt8Builder>,
+}
+
 pub struct StringBuilder {
     builder: ListBuilder<UInt8Builder>,
 }
 
+pub struct LargeStringBuilder {
+    builder: LargeListBuilder<UInt8Builder>,
+}
+
 pub struct FixedSizeBinaryBuilder {
     builder: FixedSizeListBuilder<UInt8Builder>,
 }
@@ -1019,6 +1228,8 @@ pub trait BinaryArrayBuilder: ArrayBuilder {}
 
 impl BinaryArrayBuilder for BinaryBuilder {}
 impl BinaryArrayBuilder for StringBuilder {}
+impl BinaryArrayBuilder for LargeStringBuilder {}
+impl BinaryArrayBuilder for LargeBinaryBuilder {}
 impl BinaryArrayBuilder for FixedSizeBinaryBuilder {}
 
 impl ArrayBuilder for BinaryBuilder {
@@ -1062,6 +1273,47 @@ impl ArrayBuilder for BinaryBuilder {
     }
 }
 
+impl ArrayBuilder for LargeBinaryBuilder {
+    /// Returns the builder as a non-mutable `Any` reference.
+    fn as_any(&self) -> &Any {
+        self
+    }
+
+    /// Returns the builder as a mutable `Any` reference.
+    fn as_any_mut(&mut self) -> &mut Any {
+        self
+    }
+
+    /// Appends data from other arrays into the builder
+    ///
+    /// This is most useful when concatenating arrays of the same type into a 
builder.
+    fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> {
+        append_large_binary_data(&mut self.builder, &DataType::LargeBinary, 
data)
+    }
+
+    /// Returns the data type of the builder
+    ///
+    /// This is used for validating array data types in `append_data`
+    fn data_type(&self) -> DataType {
+        DataType::LargeBinary
+    }
+
+    /// Returns the boxed builder as a box of `Any`.
+    fn into_box_any(self: Box<Self>) -> Box<Any> {
+        self
+    }
+
+    /// Returns the number of array slots in the builder
+    fn len(&self) -> usize {
+        self.builder.len()
+    }
+
+    /// Builds the array and reset this builder.
+    fn finish(&mut self) -> ArrayRef {
+        Arc::new(self.finish())
+    }
+}
+
 impl ArrayBuilder for StringBuilder {
     /// Returns the builder as a non-mutable `Any` reference.
     fn as_any(&self) -> &Any {
@@ -1155,6 +1407,99 @@ fn append_binary_data(
     Ok(())
 }
 
+// Helper function for appending LargeBinary and LargeUtf8 data
+//
+// Every input array is re-described as a `LargeList<UInt8>` over the same buffers
+// and then passed to the inner list builder's `append_data`.
+fn append_large_binary_data(
+    builder: &mut LargeListBuilder<UInt8Builder>,
+    data_type: &DataType,
+    data: &[ArrayDataRef],
+) -> Result<()> {
+    // validate the data type and buffer layout of every input before appending
+    for array in data {
+        if array.data_type() != data_type {
+            return Err(ArrowError::InvalidArgumentError(
+                "Cannot append data to builder if data types are different".to_string(),
+            ));
+        }
+        if array.buffers().len() != 2 {
+            return Err(ArrowError::InvalidArgumentError(
+                "Binary/String arrays should have 2 buffers".to_string(),
+            ));
+        }
+    }
+
+    builder.append_data(
+        &data
+            .iter()
+            .map(|array| {
+                // buffer 1 holds the value bytes; wrap it as a UInt8 child array
+                let int_data = &array.buffers()[1];
+                let int_data = Arc::new(ArrayData::new(
+                    DataType::UInt8,
+                    int_data.len(),
+                    None,
+                    None,
+                    0,
+                    vec![int_data.clone()],
+                    vec![],
+                )) as ArrayDataRef;
+
+                // buffer 0 holds the offsets; reuse it together with the original
+                // null buffer and array offset to form the list view
+                Arc::new(ArrayData::new(
+                    DataType::LargeList(Box::new(DataType::UInt8)),
+                    array.len(),
+                    None,
+                    array.null_buffer().cloned(),
+                    array.offset(),
+                    vec![array.buffers()[0].clone()],
+                    vec![int_data],
+                ))
+            })
+            .collect::<Vec<ArrayDataRef>>(),
+    )?;
+
+    Ok(())
+}
+
+impl ArrayBuilder for LargeStringBuilder {
+    /// Returns the builder as a non-mutable `Any` reference.
+    fn as_any(&self) -> &Any {
+        self
+    }
+
+    /// Returns the builder as a mutable `Any` reference.
+    fn as_any_mut(&mut self) -> &mut Any {
+        self
+    }
+
+    /// Appends data from other arrays into the builder
+    ///
+    /// This is most useful when concatenating arrays of the same type into a 
builder.
+    fn append_data(&mut self, data: &[ArrayDataRef]) -> Result<()> {
+        append_large_binary_data(&mut self.builder, &DataType::LargeUtf8, data)
+    }
+
+    /// Returns the data type of the builder
+    ///
+    /// This is used for validating array data types in `append_data`
+    fn data_type(&self) -> DataType {
+        DataType::LargeUtf8
+    }
+
+    /// Returns the boxed builder as a box of `Any`.
+    fn into_box_any(self: Box<Self>) -> Box<Any> {
+        self
+    }
+
+    /// Returns the number of array slots in the builder
+    fn len(&self) -> usize {
+        self.builder.len()
+    }
+
+    /// Builds the array and reset this builder.
+    fn finish(&mut self) -> ArrayRef {
+        Arc::new(self.finish())
+    }
+}
+
 impl ArrayBuilder for FixedSizeBinaryBuilder {
     /// Returns the builder as a non-mutable `Any` reference.
     fn as_any(&self) -> &Any {
@@ -1278,6 +1623,51 @@ impl BinaryBuilder {
     }
 }
 
+impl LargeBinaryBuilder {
+    /// Creates a new `LargeBinaryBuilder`, `capacity` is the number of bytes 
in the values
+    /// array
+    pub fn new(capacity: usize) -> Self {
+        let values_builder = UInt8Builder::new(capacity);
+        Self {
+            builder: LargeListBuilder::new(values_builder),
+        }
+    }
+
+    /// Appends a single byte value into the builder's values array.
+    ///
+    /// Note, when appending individual byte values you must call `append` to 
delimit each
+    /// distinct list value.
+    pub fn append_byte(&mut self, value: u8) -> Result<()> {
+        self.builder.values().append_value(value)?;
+        Ok(())
+    }
+
+    /// Appends a byte slice into the builder.
+    ///
+    /// Automatically calls the `append` method to delimit the slice appended 
in as a
+    /// distinct array element.
+    pub fn append_value(&mut self, value: &[u8]) -> Result<()> {
+        self.builder.values().append_slice(value)?;
+        self.builder.append(true)?;
+        Ok(())
+    }
+
+    /// Finish the current variable-length list array slot.
+    pub fn append(&mut self, is_valid: bool) -> Result<()> {
+        self.builder.append(is_valid)
+    }
+
+    /// Append a null value to the array.
+    pub fn append_null(&mut self) -> Result<()> {
+        self.append(false)
+    }
+
+    /// Builds the `LargeBinaryArray` and reset this builder.
+    pub fn finish(&mut self) -> LargeBinaryArray {
+        LargeBinaryArray::from(self.builder.finish())
+    }
+}
+
 impl StringBuilder {
     /// Creates a new `StringBuilder`,
     /// `capacity` is the number of bytes of string data to pre-allocate space 
for in this builder
@@ -1324,6 +1714,52 @@ impl StringBuilder {
     }
 }
 
+impl LargeStringBuilder {
+    /// Creates a new `LargeStringBuilder`,
+    /// `capacity` is the number of bytes of string data to pre-allocate space for in this builder
+    pub fn new(capacity: usize) -> Self {
+        let values_builder = UInt8Builder::new(capacity);
+        Self {
+            builder: LargeListBuilder::new(values_builder),
+        }
+    }
+
+    /// Creates a new `LargeStringBuilder`,
+    /// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder
+    /// `item_capacity` is the number of items to pre-allocate space for in this builder
+    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+        let values_builder = UInt8Builder::new(data_capacity);
+        Self {
+            builder: LargeListBuilder::with_capacity(values_builder, item_capacity),
+        }
+    }
+
+    /// Appends a string into the builder.
+    ///
+    /// Automatically calls the `append` method to delimit the string appended in as a
+    /// distinct array element.
+    pub fn append_value(&mut self, value: &str) -> Result<()> {
+        // the string is stored as its raw UTF-8 bytes in the values builder
+        self.builder.values().append_slice(value.as_bytes())?;
+        self.builder.append(true)?;
+        Ok(())
+    }
+
+    /// Finish the current variable-length list array slot.
+    pub fn append(&mut self, is_valid: bool) -> Result<()> {
+        self.builder.append(is_valid)
+    }
+
+    /// Append a null value to the array.
+    pub fn append_null(&mut self) -> Result<()> {
+        self.append(false)
+    }
+
+    /// Builds the `LargeStringArray` and reset this builder.
+    pub fn finish(&mut self) -> LargeStringArray {
+        LargeStringArray::from(self.builder.finish())
+    }
+}
+
 impl FixedSizeBinaryBuilder {
     /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in 
the values
     /// array
@@ -2313,6 +2749,45 @@ mod tests {
     }
 
     #[test]
+    fn test_large_list_array_builder() {
+        let values_builder = Int32Builder::new(10);
+        let mut builder = LargeListBuilder::new(values_builder);
+
+        //  [[0, 1, 2], [3, 4, 5], [6, 7]]
+        builder.values().append_value(0).unwrap();
+        builder.values().append_value(1).unwrap();
+        builder.values().append_value(2).unwrap();
+        builder.append(true).unwrap();
+        builder.values().append_value(3).unwrap();
+        builder.values().append_value(4).unwrap();
+        builder.values().append_value(5).unwrap();
+        builder.append(true).unwrap();
+        builder.values().append_value(6).unwrap();
+        builder.values().append_value(7).unwrap();
+        builder.append(true).unwrap();
+        let list_array = builder.finish();
+
+        let values = list_array.values().data().buffers()[0].clone();
+        assert_eq!(
+            Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice()),
+            values
+        );
+        assert_eq!(
+            Buffer::from(&[0i64, 3, 6, 8].to_byte_slice()),
+            list_array.data().buffers()[0].clone()
+        );
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(3, list_array.len());
+        assert_eq!(0, list_array.null_count());
+        assert_eq!(6, list_array.value_offset(2));
+        assert_eq!(2, list_array.value_length(2));
+        for i in 0..3 {
+            assert!(list_array.is_valid(i));
+            assert!(!list_array.is_null(i));
+        }
+    }
+
+    #[test]
     fn test_list_array_builder_nulls() {
         let values_builder = Int32Builder::new(10);
         let mut builder = ListBuilder::new(values_builder);
@@ -2340,6 +2815,33 @@ mod tests {
     }
 
     #[test]
+    fn test_large_list_array_builder_nulls() {
+        let values_builder = Int32Builder::new(10);
+        let mut builder = LargeListBuilder::new(values_builder);
+
+        //  [[0, 1, 2], null, [3, null, 5], [6, 7]]
+        builder.values().append_value(0).unwrap();
+        builder.values().append_value(1).unwrap();
+        builder.values().append_value(2).unwrap();
+        builder.append(true).unwrap();
+        builder.append(false).unwrap();
+        builder.values().append_value(3).unwrap();
+        builder.values().append_null().unwrap();
+        builder.values().append_value(5).unwrap();
+        builder.append(true).unwrap();
+        builder.values().append_value(6).unwrap();
+        builder.values().append_value(7).unwrap();
+        builder.append(true).unwrap();
+        let list_array = builder.finish();
+
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(4, list_array.len());
+        assert_eq!(1, list_array.null_count());
+        assert_eq!(3, list_array.value_offset(2));
+        assert_eq!(3, list_array.value_length(2));
+    }
+
+    #[test]
     fn test_fixed_size_list_array_builder() {
         let values_builder = Int32Builder::new(10);
         let mut builder = FixedSizeListBuilder::new(values_builder, 3);
@@ -2509,6 +3011,37 @@ mod tests {
     }
 
     #[test]
+    fn test_large_binary_array_builder() {
+        let mut builder = LargeBinaryBuilder::new(20);
+
+        builder.append_byte(b'h').unwrap();
+        builder.append_byte(b'e').unwrap();
+        builder.append_byte(b'l').unwrap();
+        builder.append_byte(b'l').unwrap();
+        builder.append_byte(b'o').unwrap();
+        builder.append(true).unwrap();
+        builder.append(true).unwrap();
+        builder.append_byte(b'w').unwrap();
+        builder.append_byte(b'o').unwrap();
+        builder.append_byte(b'r').unwrap();
+        builder.append_byte(b'l').unwrap();
+        builder.append_byte(b'd').unwrap();
+        builder.append(true).unwrap();
+
+        let array = builder.finish();
+
+        let binary_array = LargeBinaryArray::from(array);
+
+        assert_eq!(3, binary_array.len());
+        assert_eq!(0, binary_array.null_count());
+        assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0));
+        assert_eq!([] as [u8; 0], binary_array.value(1));
+        assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2));
+        assert_eq!(5, binary_array.value_offset(2));
+        assert_eq!(5, binary_array.value_length(2));
+    }
+
+    #[test]
     fn test_string_array_builder() {
         let mut builder = StringBuilder::new(20);
 
diff --git a/rust/arrow/src/array/equal.rs b/rust/arrow/src/array/equal.rs
index ab76c5d..02707a5 100644
--- a/rust/arrow/src/array/equal.rs
+++ b/rust/arrow/src/array/equal.rs
@@ -227,6 +227,78 @@ impl ArrayEqual for ListArray {
     }
 }
 
+impl ArrayEqual for LargeListArray {
+    /// Checks whether `other` holds the same lists as `self`.
+    ///
+    /// Compares the base array data, then the relative value offsets, and finally
+    /// the child values covered by those offsets. `other` is downcast with
+    /// `unwrap`, so it must be a `LargeListArray`.
+    fn equals(&self, other: &dyn Array) -> bool {
+        if !base_equal(&self.data(), &other.data()) {
+            return false;
+        }
+
+        let other = other.as_any().downcast_ref::<LargeListArray>().unwrap();
+
+        if !large_value_offset_equal(self, other) {
+            return false;
+        }
+
+        if !self.values().range_equals(
+            &*other.values(),
+            self.value_offset(0) as usize,
+            self.value_offset(self.len()) as usize,
+            other.value_offset(0) as usize,
+        ) {
+            return false;
+        }
+
+        true
+    }
+
+    /// Compares slots `start_idx..end_idx` of `self` with the run of the same
+    /// length in `other` that starts at `other_start_idx`.
+    ///
+    /// Two null slots are considered equal without looking at their values.
+    fn range_equals(
+        &self,
+        other: &dyn Array,
+        start_idx: usize,
+        end_idx: usize,
+        other_start_idx: usize,
+    ) -> bool {
+        assert!(other_start_idx + (end_idx - start_idx) <= other.len());
+        let other = other.as_any().downcast_ref::<LargeListArray>().unwrap();
+
+        // Pair the indices up front so that skipping a null slot advances both
+        // sides; a manually incremented cursor would fall behind on `continue`.
+        for (i, j) in (start_idx..end_idx).zip(other_start_idx..) {
+            let is_null = self.is_null(i);
+            let other_is_null = other.is_null(j);
+
+            if is_null != other_is_null {
+                return false;
+            }
+
+            if is_null {
+                continue;
+            }
+
+            let start_offset = self.value_offset(i) as usize;
+            let end_offset = self.value_offset(i + 1) as usize;
+            let other_start_offset = other.value_offset(j) as usize;
+            let other_end_offset = other.value_offset(j + 1) as usize;
+
+            if end_offset - start_offset != other_end_offset - other_start_offset {
+                return false;
+            }
+
+            if !self.values().range_equals(
+                &*other.values(),
+                start_offset,
+                end_offset,
+                other_start_offset,
+            ) {
+                return false;
+            }
+        }
+
+        true
+    }
+}
+
 impl<T: ArrowPrimitiveType> ArrayEqual for DictionaryArray<T> {
     fn equals(&self, other: &dyn Array) -> bool {
         self.range_equals(other, 0, self.len(), 0)
@@ -529,6 +601,214 @@ impl ArrayEqual for StringArray {
     }
 }
 
+impl ArrayEqual for LargeBinaryArray {
+    /// Checks whether `other` holds the same binary values as `self`.
+    ///
+    /// Compares the base array data and the relative value offsets first, then
+    /// the value bytes: as one contiguous range when `self` has no nulls, slot by
+    /// slot (skipping nulls) otherwise. `other` is downcast with `unwrap`, so it
+    /// must be a `LargeBinaryArray`.
+    fn equals(&self, other: &dyn Array) -> bool {
+        if !base_equal(&self.data(), &other.data()) {
+            return false;
+        }
+
+        let other = other.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
+
+        if !large_value_offset_equal(self, other) {
+            return false;
+        }
+
+        // TODO: handle null & length == 0 case?
+
+        let value_buf = self.value_data();
+        let other_value_buf = other.value_data();
+        let value_data = value_buf.data();
+        let other_value_data = other_value_buf.data();
+
+        if self.null_count() == 0 {
+            // No offset in both - just do memcmp
+            if self.offset() == 0 && other.offset() == 0 {
+                let len = self.value_offset(self.len()) as usize;
+                return value_data[..len] == other_value_data[..len];
+            } else {
+                let start = self.value_offset(0) as usize;
+                let other_start = other.value_offset(0) as usize;
+                let len = (self.value_offset(self.len()) - self.value_offset(0)) as usize;
+                return value_data[start..(start + len)]
+                    == other_value_data[other_start..(other_start + len)];
+            }
+        } else {
+            for i in 0..self.len() {
+                if self.is_null(i) {
+                    continue;
+                }
+
+                let start = self.value_offset(i) as usize;
+                let other_start = other.value_offset(i) as usize;
+                let len = self.value_length(i) as usize;
+                if value_data[start..(start + len)]
+                    != other_value_data[other_start..(other_start + len)]
+                {
+                    return false;
+                }
+            }
+        }
+
+        true
+    }
+
+    /// Compares slots `start_idx..end_idx` of `self` with the run of the same
+    /// length in `other` that starts at `other_start_idx`.
+    ///
+    /// Two null slots are considered equal without looking at their bytes.
+    fn range_equals(
+        &self,
+        other: &dyn Array,
+        start_idx: usize,
+        end_idx: usize,
+        other_start_idx: usize,
+    ) -> bool {
+        assert!(other_start_idx + (end_idx - start_idx) <= other.len());
+        let other = other.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
+
+        // The value buffers do not change between slots, so fetch them once.
+        let value_buf = self.value_data();
+        let other_value_buf = other.value_data();
+        let value_data = value_buf.data();
+        let other_value_data = other_value_buf.data();
+
+        // Pair the indices up front so that skipping a null slot advances both
+        // sides; a manually incremented cursor would fall behind on `continue`.
+        for (i, j) in (start_idx..end_idx).zip(other_start_idx..) {
+            let is_null = self.is_null(i);
+            let other_is_null = other.is_null(j);
+
+            if is_null != other_is_null {
+                return false;
+            }
+
+            if is_null {
+                continue;
+            }
+
+            let start_offset = self.value_offset(i) as usize;
+            let end_offset = self.value_offset(i + 1) as usize;
+            let other_start_offset = other.value_offset(j) as usize;
+            let other_end_offset = other.value_offset(j + 1) as usize;
+
+            if end_offset - start_offset != other_end_offset - other_start_offset {
+                return false;
+            }
+
+            if end_offset - start_offset > 0 {
+                let len = end_offset - start_offset;
+                if value_data[start_offset..(start_offset + len)]
+                    != other_value_data[other_start_offset..(other_start_offset + len)]
+                {
+                    return false;
+                }
+            }
+        }
+
+        true
+    }
+}
+
+impl ArrayEqual for LargeStringArray {
+    /// Checks whether `other` holds the same strings as `self`.
+    ///
+    /// Compares the base array data and the relative value offsets first, then
+    /// the value bytes: as one contiguous range when `self` has no nulls, slot by
+    /// slot (skipping nulls) otherwise. `other` is downcast with `unwrap`, so it
+    /// must be a `LargeStringArray`.
+    fn equals(&self, other: &dyn Array) -> bool {
+        if !base_equal(&self.data(), &other.data()) {
+            return false;
+        }
+
+        let other = other.as_any().downcast_ref::<LargeStringArray>().unwrap();
+
+        if !large_value_offset_equal(self, other) {
+            return false;
+        }
+
+        // TODO: handle null & length == 0 case?
+
+        let value_buf = self.value_data();
+        let other_value_buf = other.value_data();
+        let value_data = value_buf.data();
+        let other_value_data = other_value_buf.data();
+
+        if self.null_count() == 0 {
+            // No offset in both - just do memcmp
+            if self.offset() == 0 && other.offset() == 0 {
+                let len = self.value_offset(self.len()) as usize;
+                return value_data[..len] == other_value_data[..len];
+            } else {
+                let start = self.value_offset(0) as usize;
+                let other_start = other.value_offset(0) as usize;
+                let len = (self.value_offset(self.len()) - self.value_offset(0)) as usize;
+                return value_data[start..(start + len)]
+                    == other_value_data[other_start..(other_start + len)];
+            }
+        } else {
+            for i in 0..self.len() {
+                if self.is_null(i) {
+                    continue;
+                }
+
+                let start = self.value_offset(i) as usize;
+                let other_start = other.value_offset(i) as usize;
+                let len = self.value_length(i) as usize;
+                if value_data[start..(start + len)]
+                    != other_value_data[other_start..(other_start + len)]
+                {
+                    return false;
+                }
+            }
+        }
+
+        true
+    }
+
+    /// Compares slots `start_idx..end_idx` of `self` with the run of the same
+    /// length in `other` that starts at `other_start_idx`.
+    ///
+    /// Two null slots are considered equal without looking at their bytes.
+    fn range_equals(
+        &self,
+        other: &dyn Array,
+        start_idx: usize,
+        end_idx: usize,
+        other_start_idx: usize,
+    ) -> bool {
+        assert!(other_start_idx + (end_idx - start_idx) <= other.len());
+        let other = other.as_any().downcast_ref::<LargeStringArray>().unwrap();
+
+        // The value buffers do not change between slots, so fetch them once.
+        let value_buf = self.value_data();
+        let other_value_buf = other.value_data();
+        let value_data = value_buf.data();
+        let other_value_data = other_value_buf.data();
+
+        // Pair the indices up front so that skipping a null slot advances both
+        // sides; a manually incremented cursor would fall behind on `continue`.
+        for (i, j) in (start_idx..end_idx).zip(other_start_idx..) {
+            let is_null = self.is_null(i);
+            let other_is_null = other.is_null(j);
+
+            if is_null != other_is_null {
+                return false;
+            }
+
+            if is_null {
+                continue;
+            }
+
+            let start_offset = self.value_offset(i) as usize;
+            let end_offset = self.value_offset(i + 1) as usize;
+            let other_start_offset = other.value_offset(j) as usize;
+            let other_end_offset = other.value_offset(j + 1) as usize;
+
+            if end_offset - start_offset != other_end_offset - other_start_offset {
+                return false;
+            }
+
+            if end_offset - start_offset > 0 {
+                let len = end_offset - start_offset;
+                if value_data[start_offset..(start_offset + len)]
+                    != other_value_data[other_start_offset..(other_start_offset + len)]
+                {
+                    return false;
+                }
+            }
+        }
+
+        true
+    }
+}
+
 impl ArrayEqual for FixedSizeBinaryArray {
     fn equals(&self, other: &dyn Array) -> bool {
         if !base_equal(&self.data(), &other.data()) {
@@ -799,6 +1079,28 @@ fn value_offset_equal<T: Array + ListArrayOps>(this: &T, 
other: &T) -> bool {
     true
 }
 
+/// Compares the value offsets of two large list arrays.
+///
+/// When neither array has an offset, the raw offset buffers are compared
+/// directly. Otherwise the offsets are compared one by one, each taken
+/// relative to the first offset of its own array.
+fn large_value_offset_equal<T: Array + LargeListArrayOps>(
+    this: &T,
+    other: &T,
+) -> bool {
+    // Fast path: neither array is sliced, so compare the offset buffers as bytes.
+    if this.offset() == 0 && other.offset() == 0 {
+        // Large list offsets are `i64`, so each of the `len + 1` offsets takes
+        // 8 bytes (not 4 as for the `i32` offsets of a regular list).
+        let offset_width = std::mem::size_of::<i64>();
+        let offset_data = &this.data_ref().buffers()[0];
+        let other_offset_data = &other.data_ref().buffers()[0];
+        return offset_data.data()[0..((this.len() + 1) * offset_width)]
+            == other_offset_data.data()[0..((other.len() + 1) * offset_width)];
+    }
+
+    // The expensive case: compare every offset relative to the first one.
+    for i in 0..=this.len() {
+        if this.value_offset_at(i) - this.value_offset_at(0)
+            != other.value_offset_at(i) - other.value_offset_at(0)
+        {
+            return false;
+        }
+    }
+
+    true
+}
+
 /// Trait for comparing arrow array with json array
 pub trait JsonEqual {
     /// Checks whether arrow array equals to json array.
@@ -858,6 +1160,20 @@ impl JsonEqual for ListArray {
     }
 }
 
+impl JsonEqual for LargeListArray {
+    /// Returns `true` when the array has as many elements as `json` and every
+    /// element matches the JSON value at the same position.
+    fn equals_json(&self, json: &[&Value]) -> bool {
+        self.len() == json.len()
+            && json.iter().enumerate().all(|(i, &value)| match value {
+                // A JSON array must line up with a valid list of equal content.
+                Value::Array(items) => {
+                    self.is_valid(i) && self.value(i).equals_json_values(items)
+                }
+                // JSON null matches a null slot as well as an empty list.
+                Value::Null => self.is_null(i) || self.value_length(i) == 0,
+                _ => false,
+            })
+    }
+}
+
 impl PartialEq<Value> for ListArray {
     fn eq(&self, json: &Value) -> bool {
         match json {
@@ -867,6 +1183,15 @@ impl PartialEq<Value> for ListArray {
     }
 }
 
+impl PartialEq<Value> for LargeListArray {
+    /// Only a JSON array can equal a large list array; any other JSON value
+    /// compares unequal.
+    fn eq(&self, json: &Value) -> bool {
+        if let Value::Array(elements) = json {
+            self.equals_json_values(elements)
+        } else {
+            false
+        }
+    }
+}
+
 impl PartialEq<ListArray> for Value {
     fn eq(&self, arrow: &ListArray) -> bool {
         match self {
@@ -876,6 +1201,15 @@ impl PartialEq<ListArray> for Value {
     }
 }
 
+impl PartialEq<LargeListArray> for Value {
+    /// Mirror of `LargeListArray == Value`: only a JSON array can equal a large
+    /// list array.
+    fn eq(&self, arrow: &LargeListArray) -> bool {
+        if let Value::Array(elements) = self {
+            arrow.equals_json_values(elements)
+        } else {
+            false
+        }
+    }
+}
+
 impl<T: ArrowPrimitiveType> JsonEqual for DictionaryArray<T> {
     fn equals_json(&self, json: &[&Value]) -> bool {
         self.keys().zip(json.iter()).all(|aj| match aj {
@@ -1028,6 +1362,44 @@ impl PartialEq<BinaryArray> for Value {
     }
 }
 
+impl JsonEqual for LargeBinaryArray {
+    /// Returns `true` when the array has as many elements as `json` and every
+    /// element matches the JSON value at the same position.
+    fn equals_json(&self, json: &[&Value]) -> bool {
+        self.len() == json.len()
+            && json.iter().enumerate().all(|(i, &value)| match value {
+                JString(text) => {
+                    // A null slot never matches a JSON string.
+                    if !self.is_valid(i) {
+                        return false;
+                    }
+                    // Binary data is sometimes hex encoded: accept a direct
+                    // byte match first, then try the hex-decoded string.
+                    let bytes = self.value(i);
+                    text.as_bytes() == bytes
+                        || Vec::from_hex(text.as_str()) == Ok(bytes.to_vec())
+                }
+                JNull => self.is_null(i),
+                _ => false,
+            })
+    }
+}
+
+impl PartialEq<Value> for LargeBinaryArray {
+    /// Only a JSON array can equal a large binary array; any other JSON value
+    /// compares unequal.
+    fn eq(&self, json: &Value) -> bool {
+        if let Value::Array(elements) = json {
+            self.equals_json_values(elements)
+        } else {
+            false
+        }
+    }
+}
+
+impl PartialEq<LargeBinaryArray> for Value {
+    /// Mirror of `LargeBinaryArray == Value`: only a JSON array can equal a
+    /// large binary array.
+    fn eq(&self, arrow: &LargeBinaryArray) -> bool {
+        if let Value::Array(elements) = self {
+            arrow.equals_json_values(elements)
+        } else {
+            false
+        }
+    }
+}
+
 impl JsonEqual for StringArray {
     fn equals_json(&self, json: &[&Value]) -> bool {
         if self.len() != json.len() {
@@ -1060,6 +1432,38 @@ impl PartialEq<StringArray> for Value {
     }
 }
 
+impl JsonEqual for LargeStringArray {
+    /// Returns `true` when the array has as many elements as `json` and every
+    /// element matches the JSON value at the same position.
+    fn equals_json(&self, json: &[&Value]) -> bool {
+        self.len() == json.len()
+            && json.iter().enumerate().all(|(i, &value)| match value {
+                // A JSON string must line up with a valid, identical string.
+                JString(text) => self.is_valid(i) && text.as_str() == self.value(i),
+                // JSON null only matches a null slot.
+                JNull => self.is_null(i),
+                _ => false,
+            })
+    }
+}
+
+impl PartialEq<Value> for LargeStringArray {
+    /// Only a JSON array can equal a large string array; any other JSON value
+    /// compares unequal.
+    fn eq(&self, json: &Value) -> bool {
+        if let Value::Array(elements) = json {
+            self.equals_json_values(elements)
+        } else {
+            false
+        }
+    }
+}
+
+impl PartialEq<LargeStringArray> for Value {
+    /// Mirror of `LargeStringArray == Value`: only a JSON array can equal a
+    /// large string array.
+    fn eq(&self, arrow: &LargeStringArray) -> bool {
+        if let Value::Array(elements) = self {
+            arrow.equals_json_values(elements)
+        } else {
+            false
+        }
+    }
+}
+
 impl JsonEqual for FixedSizeBinaryArray {
     fn equals_json(&self, json: &[&Value]) -> bool {
         if self.len() != json.len() {
@@ -1456,6 +1860,83 @@ mod tests {
     }
 
     #[test]
+    fn test_large_string_equal() {
+        // Builds a large string array in which `None` entries become null slots.
+        let nullable =
+            |values: Vec<Option<&str>>| LargeStringArray::try_from(values).unwrap();
+
+        // Arrays without nulls: equality has to hold in both directions.
+        let lhs = LargeStringArray::from(vec!["hello", "world"]);
+        let same = LargeStringArray::from(vec!["hello", "world"]);
+        assert!(lhs.equals(&same));
+        assert!(same.equals(&lhs));
+
+        let different = LargeStringArray::from(vec!["hello", "arrow"]);
+        assert!(!lhs.equals(&different));
+        assert!(!different.equals(&lhs));
+
+        // Test the case where null_count > 0
+
+        let lhs =
+            nullable(vec![Some("hello"), None, None, Some("world"), None, None]);
+        let same =
+            nullable(vec![Some("hello"), None, None, Some("world"), None, None]);
+        assert!(lhs.equals(&same));
+        assert!(same.equals(&lhs));
+
+        // A value where the other array holds a null.
+        let extra_value = nullable(vec![
+            Some("hello"),
+            Some("foo"),
+            None,
+            Some("world"),
+            None,
+            None,
+        ]);
+        assert!(!lhs.equals(&extra_value));
+        assert!(!extra_value.equals(&lhs));
+
+        // Same null layout, but a different string at index 3.
+        let rhs =
+            nullable(vec![Some("hello"), None, None, Some("arrow"), None, None]);
+        assert!(!lhs.equals(&rhs));
+        assert!(!rhs.equals(&lhs));
+
+        // Sliced views: `lhs` and `rhs` differ only at index 3, so a slice
+        // compares equal exactly when it leaves that index out.
+
+        let lhs_slice = lhs.slice(0, 3);
+        let rhs_slice = rhs.slice(0, 3);
+        assert!(lhs_slice.equals(&*rhs_slice));
+        assert!(rhs_slice.equals(&*lhs_slice));
+
+        let lhs_slice = lhs.slice(0, 5);
+        let rhs_slice = rhs.slice(0, 5);
+        assert!(!lhs_slice.equals(&*rhs_slice));
+        assert!(!rhs_slice.equals(&*lhs_slice));
+
+        // Test the case where offset != 0
+        let lhs_slice = lhs.slice(4, 1);
+        let rhs_slice = rhs.slice(4, 1);
+        assert!(lhs_slice.equals(&*rhs_slice));
+        assert!(rhs_slice.equals(&*lhs_slice));
+    }
+
+    #[test]
     fn test_struct_equal() {
         let string_builder = StringBuilder::new(5);
         let int_builder = Int32Builder::new(5);
diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs
index c2bb03d..3abc142 100644
--- a/rust/arrow/src/array/mod.rs
+++ b/rust/arrow/src/array/mod.rs
@@ -105,6 +105,9 @@ pub use self::array::BinaryArray;
 pub use self::array::DictionaryArray;
 pub use self::array::FixedSizeBinaryArray;
 pub use self::array::FixedSizeListArray;
+pub use self::array::LargeBinaryArray;
+pub use self::array::LargeListArray;
+pub use self::array::LargeStringArray;
 pub use self::array::ListArray;
 pub use self::array::PrimitiveArray;
 pub use self::array::StringArray;
@@ -152,6 +155,7 @@ pub type DurationMillisecondArray = 
PrimitiveArray<DurationMillisecondType>;
 pub type DurationMicrosecondArray = PrimitiveArray<DurationMicrosecondType>;
 pub type DurationNanosecondArray = PrimitiveArray<DurationNanosecondType>;
 
+pub use self::array::LargeListArrayOps;
 pub use self::array::ListArrayOps;
 pub use self::array::PrimitiveArrayOps;
 
@@ -193,6 +197,9 @@ pub use self::builder::ArrayBuilder;
 pub use self::builder::BinaryBuilder;
 pub use self::builder::FixedSizeBinaryBuilder;
 pub use self::builder::FixedSizeListBuilder;
+pub use self::builder::LargeBinaryBuilder;
+pub use self::builder::LargeListBuilder;
+pub use self::builder::LargeStringBuilder;
 pub use self::builder::ListBuilder;
 pub use self::builder::PrimitiveBuilder;
 pub use self::builder::PrimitiveDictionaryBuilder;
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index 1bc5027..f140e23 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -116,12 +116,18 @@ pub enum DataType {
     /// Opaque binary data of fixed size.
     /// Enum parameter specifies the number of bytes per value.
     FixedSizeBinary(i32),
+    /// Opaque binary data of variable length and 64-bit offsets.
+    LargeBinary,
     /// A variable-length string in Unicode with UTF-8 encoding.
     Utf8,
+    /// A variable-length string in Unicode with UTF-8 encoding and 64-bit offsets.
+    LargeUtf8,
     /// A list of some logical data type with variable length.
     List(Box<DataType>),
     /// A list of some logical data type with fixed length.
     FixedSizeList(Box<DataType>, i32),
+    /// A list of some logical data type with variable length and 64-bit 
offsets.
+    LargeList(Box<DataType>),
     /// A nested datatype that contains a number of sub-fields.
     Struct(Vec<Field>),
     /// A nested datatype that can represent slots of differing types.
@@ -769,7 +775,9 @@ impl DataType {
                 Some(s) if s == "null" => Ok(DataType::Null),
                 Some(s) if s == "bool" => Ok(DataType::Boolean),
                 Some(s) if s == "binary" => Ok(DataType::Binary),
+                Some(s) if s == "largebinary" => Ok(DataType::LargeBinary),
                 Some(s) if s == "utf8" => Ok(DataType::Utf8),
+                Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8),
                 Some(s) if s == "fixedsizebinary" => {
                     // return a list with any type as its child isn't defined 
in the map
                     if let Some(Value::Number(size)) = map.get("byteWidth") {
@@ -897,6 +905,10 @@ impl DataType {
                     // return a list with any type as its child isn't defined 
in the map
                     Ok(DataType::List(Box::new(DataType::Boolean)))
                 }
+                Some(s) if s == "largelist" => {
+                    // the child type isn't defined in the map, so return a largelist
+                    // with a placeholder child type
+                    Ok(DataType::LargeList(Box::new(DataType::Boolean)))
+                }
                 Some(s) if s == "fixedsizelist" => {
                     // return a list with any type as its child isn't defined 
in the map
                     if let Some(Value::Number(size)) = map.get("listSize") {
@@ -943,13 +955,16 @@ impl DataType {
             DataType::Float32 => json!({"name": "floatingpoint", "precision": 
"SINGLE"}),
             DataType::Float64 => json!({"name": "floatingpoint", "precision": 
"DOUBLE"}),
             DataType::Utf8 => json!({"name": "utf8"}),
+            DataType::LargeUtf8 => json!({"name": "largeutf8"}),
             DataType::Binary => json!({"name": "binary"}),
+            DataType::LargeBinary => json!({"name": "largebinary"}),
             DataType::FixedSizeBinary(byte_width) => {
                 json!({"name": "fixedsizebinary", "byteWidth": byte_width})
             }
             DataType::Struct(_) => json!({"name": "struct"}),
             DataType::Union(_) => json!({"name": "union"}),
             DataType::List(_) => json!({ "name": "list"}),
+            DataType::LargeList(_) => json!({ "name": "largelist"}),
             DataType::FixedSizeList(_, length) => {
                 json!({"name":"fixedsizelist", "listSize": length})
             }
@@ -1080,18 +1095,22 @@ impl Field {
                 };
                 // if data_type is a struct or list, get its children
                 let data_type = match data_type {
-                    DataType::List(_) | DataType::FixedSizeList(_, _) => {
-                        match map.get("children") {
-                            Some(Value::Array(values)) => {
-                                if values.len() != 1 {
-                                    return Err(ArrowError::ParseError(
+                    DataType::List(_)
+                    | DataType::LargeList(_)
+                    | DataType::FixedSizeList(_, _) => match 
map.get("children") {
+                        Some(Value::Array(values)) => {
+                            if values.len() != 1 {
+                                return Err(ArrowError::ParseError(
                                     "Field 'children' must have one element 
for a list data type".to_string(),
                                 ));
-                                }
-                                match data_type {
+                            }
+                            match data_type {
                                     DataType::List(_) => 
DataType::List(Box::new(
                                         Self::from(&values[0])?.data_type,
                                     )),
+                                    DataType::LargeList(_) => 
DataType::LargeList(Box::new(
+                                        Self::from(&values[0])?.data_type,
+                                    )),
                                     DataType::FixedSizeList(_, int) => {
                                         DataType::FixedSizeList(
                                             
Box::new(Self::from(&values[0])?.data_type),
@@ -1099,22 +1118,21 @@ impl Field {
                                         )
                                     }
                                     _ => unreachable!(
-                                        "Data type should be a list or 
fixedsizelist"
+                                        "Data type should be a list, largelist 
or fixedsizelist"
                                     ),
                                 }
-                            }
-                            Some(_) => {
-                                return Err(ArrowError::ParseError(
-                                    "Field 'children' must be an 
array".to_string(),
-                                ))
-                            }
-                            None => {
-                                return Err(ArrowError::ParseError(
-                                    "Field missing 'children' 
attribute".to_string(),
-                                ));
-                            }
                         }
-                    }
+                        Some(_) => {
+                            return Err(ArrowError::ParseError(
+                                "Field 'children' must be an 
array".to_string(),
+                            ))
+                        }
+                        None => {
+                            return Err(ArrowError::ParseError(
+                                "Field missing 'children' 
attribute".to_string(),
+                            ));
+                        }
+                    },
                     DataType::Struct(mut fields) => match map.get("children") {
                         Some(Value::Array(values)) => {
                             let struct_fields: Result<Vec<Field>> =
@@ -1191,6 +1209,10 @@ impl Field {
                 let item = Field::new("item", *dtype.clone(), self.nullable);
                 vec![item.to_json()]
             }
+            DataType::LargeList(dtype) => {
+                let item = Field::new("item", *dtype.clone(), self.nullable);
+                vec![item.to_json()]
+            }
             DataType::FixedSizeList(dtype, _) => {
                 let item = Field::new("item", *dtype.clone(), self.nullable);
                 vec![item.to_json()]
@@ -1307,12 +1329,15 @@ impl Field {
             | DataType::Time64(_)
             | DataType::Duration(_)
             | DataType::Binary
+            | DataType::LargeBinary
             | DataType::Interval(_)
+            | DataType::LargeList(_)
             | DataType::List(_)
             | DataType::Dictionary(_, _)
             | DataType::FixedSizeList(_, _)
             | DataType::FixedSizeBinary(_)
-            | DataType::Utf8 => {
+            | DataType::Utf8
+            | DataType::LargeUtf8 => {
                 if self.data_type != from.data_type {
                     return Err(ArrowError::SchemaError(
                         "Fail to merge schema Field due to conflicting 
datatype"
@@ -1846,6 +1871,15 @@ mod tests {
                     123,
                     true,
                 ),
+                Field::new("c32", DataType::LargeBinary, true),
+                Field::new("c33", DataType::LargeUtf8, true),
+                Field::new(
+                    "c34",
+                    DataType::LargeList(Box::new(DataType::LargeList(Box::new(
+                        DataType::Struct(vec![]),
+                    )))),
+                    true,
+                ),
             ],
             metadata,
         );
@@ -2193,11 +2227,53 @@ mod tests {
                           "id": 123,
                           "indexType": {
                             "name": "int",
-                            "isSigned": true,
-                            "bitWidth": 32
+                            "bitWidth": 32,
+                            "isSigned": true
                           },
                           "isOrdered": true
                         }
+                    },
+                    {
+                        "name": "c32",
+                        "nullable": true,
+                        "type": {
+                          "name": "largebinary"
+                        },
+                        "children": []
+                    },
+                    {
+                        "name": "c33",
+                        "nullable": true,
+                        "type": {
+                          "name": "largeutf8"
+                        },
+                        "children": []
+                    },
+                    {
+                        "name": "c34",
+                        "nullable": true,
+                        "type": {
+                          "name": "largelist"
+                        },
+                        "children": [
+                            {
+                                "name": "item",
+                                "nullable": true,
+                                "type": {
+                                    "name": "largelist"
+                                },
+                                "children": [
+                                    {
+                                        "name": "item",
+                                        "nullable": true,
+                                        "type": {
+                                            "name": "struct"
+                                        },
+                                        "children": []
+                                    }
+                                ]
+                            }
+                        ]
                     }
                 ],
                 "metadata" : {
@@ -2301,10 +2377,12 @@ mod tests {
         let schema1 = Schema::new(vec![
             Field::new("c1", DataType::Utf8, false),
             Field::new("c2", DataType::Float64, true),
+            Field::new("c3", DataType::LargeBinary, true),
         ]);
         let schema2 = Schema::new(vec![
             Field::new("c1", DataType::Utf8, false),
             Field::new("c2", DataType::Float64, true),
+            Field::new("c3", DataType::LargeBinary, true),
         ]);
 
         assert_eq!(schema1, schema2);
diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs
index 84d6b39..d3c0732 100644
--- a/rust/arrow/src/ipc/convert.rs
+++ b/rust/arrow/src/ipc/convert.rs
@@ -211,7 +211,9 @@ pub(crate) fn get_data_type(field: ipc::Field, 
may_be_dictionary: bool) -> DataT
             }
         }
         ipc::Type::Binary => DataType::Binary,
+        ipc::Type::LargeBinary => DataType::LargeBinary,
         ipc::Type::Utf8 => DataType::Utf8,
+        ipc::Type::LargeUtf8 => DataType::LargeUtf8,
         ipc::Type::FixedSizeBinary => {
             let fsb = field.type_as_fixed_size_binary().unwrap();
             DataType::FixedSizeBinary(fsb.byteWidth())
@@ -292,6 +294,15 @@ pub(crate) fn get_data_type(field: ipc::Field, 
may_be_dictionary: bool) -> DataT
             // returning int16 for now, to test, not sure how to get data type
             DataType::List(Box::new(get_data_type(child_field, false)))
         }
+        ipc::Type::LargeList => {
+            let children = field.children().unwrap();
+            if children.len() != 1 {
+                panic!("expect a large list to have one child")
+            }
+            let child_field = children.get(0);
+            // the data type of the single child field is resolved recursively
+            DataType::LargeList(Box::new(get_data_type(child_field, false)))
+        }
         ipc::Type::FixedSizeList => {
             let children = field.children().unwrap();
             if children.len() != 1 {
@@ -401,6 +412,14 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>(
                 Some(children),
             )
         }
+        LargeBinary => {
+            let children = fbb.create_vector(&empty_fields[..]);
+            (
+                ipc::Type::LargeBinary,
+                ipc::LargeBinaryBuilder::new(fbb).finish().as_union_value(),
+                Some(children),
+            )
+        }
         Utf8 => {
             let children = fbb.create_vector(&empty_fields[..]);
             (
@@ -409,6 +428,14 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>(
                 Some(children),
             )
         }
+        LargeUtf8 => {
+            let children = fbb.create_vector(&empty_fields[..]);
+            (
+                ipc::Type::LargeUtf8,
+                ipc::LargeUtf8Builder::new(fbb).finish().as_union_value(),
+                Some(children),
+            )
+        }
         FixedSizeBinary(len) => {
             let children = fbb.create_vector(&empty_fields[..]);
             let mut builder = ipc::FixedSizeBinaryBuilder::new(fbb);
@@ -538,6 +565,27 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>(
                 Some(children),
             )
         }
+        LargeList(ref list_type) => {
+            let inner_types = get_fb_field_type(list_type, fbb);
+            let child = ipc::Field::create(
+                fbb,
+                &ipc::FieldArgs {
+                    name: None,
+                    nullable: false,
+                    type_type: inner_types.0,
+                    type_: Some(inner_types.1),
+                    dictionary: None,
+                    children: inner_types.2,
+                    custom_metadata: None,
+                },
+            );
+            let children = fbb.create_vector(&[child]);
+            (
+                ipc::Type::LargeList,
+                ipc::LargeListBuilder::new(fbb).finish().as_union_value(),
+                Some(children),
+            )
+        }
         FixedSizeList(ref list_type, len) => {
             let inner_types = get_fb_field_type(list_type, fbb);
             let child = ipc::Field::create(
diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs
index cb7fb4c..c4b226a 100644
--- a/rust/arrow/src/ipc/reader.rs
+++ b/rust/arrow/src/ipc/reader.rs
@@ -63,7 +63,7 @@ fn create_array(
 ) -> (ArrayRef, usize, usize) {
     use DataType::*;
     let array = match data_type {
-        Utf8 | Binary => {
+        Utf8 | Binary | LargeBinary | LargeUtf8 => {
             let array = create_primitive_array(
                 &nodes[node_index],
                 data_type,
@@ -89,7 +89,7 @@ fn create_array(
             buffer_index += 2;
             array
         }
-        List(ref list_data_type) => {
+        List(ref list_data_type) | LargeList(ref list_data_type) => {
             let list_node = &nodes[node_index];
             let list_buffers: Vec<Buffer> = buffers[buffer_index..buffer_index 
+ 2]
                 .iter()
@@ -225,7 +225,7 @@ fn create_primitive_array(
     let length = field_node.length() as usize;
     let null_count = field_node.null_count() as usize;
     let array_data = match data_type {
-        Utf8 | Binary => {
+        Utf8 | Binary | LargeBinary | LargeUtf8 => {
             // read 3 buffers
             let mut builder = ArrayData::builder(data_type.clone())
                 .len(length)
diff --git a/rust/integration-testing/src/bin/arrow-json-integration-test.rs 
b/rust/integration-testing/src/bin/arrow-json-integration-test.rs
index a83e7ed..170892f 100644
--- a/rust/integration-testing/src/bin/arrow-json-integration-test.rs
+++ b/rust/integration-testing/src/bin/arrow-json-integration-test.rs
@@ -20,11 +20,7 @@ use serde_json::Value;
 
 use arrow::util::integration_util::{ArrowJson, ArrowJsonBatch, 
ArrowJsonSchema};
 
-use arrow::array::{
-    ArrayRef, BinaryBuilder, BooleanBuilder, FixedSizeBinaryBuilder, 
Float32Builder,
-    Float64Builder, Int16Builder, Int32Builder, Int64Builder, Int8Builder, 
NullArray,
-    StringBuilder, UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder,
-};
+use arrow::array::*;
 use arrow::datatypes::{DataType, DateUnit, IntervalUnit, Schema};
 use arrow::error::{ArrowError, Result};
 use arrow::ipc::reader::FileReader;
@@ -334,6 +330,26 @@ fn record_batch_from_json(
                 }
                 Arc::new(b.finish())
             }
+            DataType::LargeBinary => {
+                let mut b = LargeBinaryBuilder::new(json_col.count);
+                for (is_valid, value) in json_col
+                    .validity
+                    .as_ref()
+                    .unwrap()
+                    .iter()
+                    .zip(json_col.data.unwrap())
+                {
+                    match is_valid {
+                        1 => {
+                            let v = decode(value.as_str().unwrap()).unwrap();
+                            b.append_value(&v)
+                        }
+                        _ => b.append_null(),
+                    }
+                    .unwrap();
+                }
+                Arc::new(b.finish())
+            }
             DataType::Utf8 => {
                 let mut b = StringBuilder::new(json_col.count);
                 for (is_valid, value) in json_col
@@ -351,6 +367,23 @@ fn record_batch_from_json(
                 }
                 Arc::new(b.finish())
             }
+            DataType::LargeUtf8 => {
+                let mut b = LargeStringBuilder::new(json_col.count);
+                for (is_valid, value) in json_col
+                    .validity
+                    .as_ref()
+                    .unwrap()
+                    .iter()
+                    .zip(json_col.data.unwrap())
+                {
+                    match is_valid {
+                        1 => b.append_value(value.as_str().unwrap()),
+                        _ => b.append_null(),
+                    }
+                    .unwrap();
+                }
+                Arc::new(b.finish())
+            }
             DataType::FixedSizeBinary(len) => {
                 let mut b = FixedSizeBinaryBuilder::new(json_col.count, *len);
                 for (is_valid, value) in json_col
diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs
index 4f56bbb..c31f9db 100644
--- a/rust/parquet/src/arrow/schema.rs
+++ b/rust/parquet/src/arrow/schema.rs
@@ -269,6 +269,9 @@ fn arrow_to_parquet_type(field: &Field) -> Result<Type> {
             let dict_field = Field::new(name, *value.clone(), 
field.is_nullable());
             arrow_to_parquet_type(&dict_field)
         }
+        DataType::LargeUtf8 | DataType::LargeBinary | DataType::LargeList(_) 
=> {
+            Err(ArrowError("Large arrays not supported".to_string()))
+        }
     }
 }
 /// This struct is used to group methods and data structures used to convert 
parquet

Reply via email to