This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new efd8b9254 Remove ArrayData from Array (#3880) (#4061)
efd8b9254 is described below
commit efd8b9254a7b16427c8ff0374727bb89b5a3a966
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Apr 12 20:28:23 2023 +0100
Remove ArrayData from Array (#3880) (#4061)
* Remove ArrayData from Array (#3880)
* Fix doc
* Fix pyarrow-integration-testing
* Review feedback
---
arrow-arith/src/aggregate.rs | 7 +-
arrow-array/src/array/boolean_array.rs | 82 +++++++----
arrow-array/src/array/byte_array.rs | 90 ++++++++----
arrow-array/src/array/dictionary_array.rs | 99 +++++++++----
arrow-array/src/array/fixed_size_binary_array.rs | 101 +++++++++----
arrow-array/src/array/fixed_size_list_array.rs | 149 ++++++++++---------
arrow-array/src/array/list_array.rs | 76 +++++++---
arrow-array/src/array/map_array.rs | 67 ++++++---
arrow-array/src/array/mod.rs | 119 ++++------------
arrow-array/src/array/null_array.rs | 54 ++++---
arrow-array/src/array/primitive_array.rs | 135 ++++++++++--------
arrow-array/src/array/run_array.rs | 89 ++++++++++--
arrow-array/src/array/struct_array.rs | 105 ++++++++++----
arrow-array/src/array/union_array.rs | 107 +++++++++++---
arrow-array/src/record_batch.rs | 4 +-
arrow-cast/src/cast.rs | 2 -
arrow-ipc/src/writer.rs | 173 +++++------------------
arrow-pyarrow-integration-testing/src/lib.rs | 10 +-
arrow-select/src/nullif.rs | 6 +-
arrow/src/lib.rs | 2 +-
20 files changed, 895 insertions(+), 582 deletions(-)
diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs
index a9944db13..9ed6dee51 100644
--- a/arrow-arith/src/aggregate.rs
+++ b/arrow-arith/src/aggregate.rs
@@ -1241,13 +1241,12 @@ mod tests {
.into_iter()
.collect();
let sliced_input = sliced_input.slice(4, 2);
- let sliced_input = sliced_input.as_boolean();
- assert_eq!(sliced_input, &input);
+ assert_eq!(sliced_input, input);
- let actual = min_boolean(sliced_input);
+ let actual = min_boolean(&sliced_input);
assert_eq!(actual, expected);
- let actual = max_boolean(sliced_input);
+ let actual = max_boolean(&sliced_input);
assert_eq!(actual, expected);
}
diff --git a/arrow-array/src/array/boolean_array.rs
b/arrow-array/src/array/boolean_array.rs
index dea5c07da..d03f0fd04 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -20,7 +20,7 @@ use crate::builder::BooleanBuilder;
use crate::iterator::BooleanIter;
use crate::{Array, ArrayAccessor, ArrayRef};
use arrow_buffer::{bit_util, BooleanBuffer, MutableBuffer, NullBuffer};
-use arrow_data::ArrayData;
+use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
use std::any::Any;
use std::sync::Arc;
@@ -66,8 +66,8 @@ use std::sync::Arc;
/// ```
#[derive(Clone)]
pub struct BooleanArray {
- data: ArrayData,
values: BooleanBuffer,
+ nulls: Option<NullBuffer>,
}
impl std::fmt::Debug for BooleanArray {
@@ -90,27 +90,25 @@ impl BooleanArray {
if let Some(n) = nulls.as_ref() {
assert_eq!(values.len(), n.len());
}
-
- // TODO: Don't store ArrayData inside arrays (#3880)
- let data = unsafe {
- ArrayData::builder(DataType::Boolean)
- .len(values.len())
- .offset(values.offset())
- .nulls(nulls)
- .buffers(vec![values.inner().clone()])
- .build_unchecked()
- };
- Self { data, values }
+ Self { values, nulls }
}
/// Returns the length of this array.
pub fn len(&self) -> usize {
- self.data.len()
+ self.values.len()
}
/// Returns whether this array is empty.
pub fn is_empty(&self) -> bool {
- self.data.is_empty()
+ self.values.is_empty()
+ }
+
+ /// Returns a zero-copy slice of this array with the indicated offset and
length.
+ pub fn slice(&self, offset: usize, length: usize) -> Self {
+ Self {
+ values: self.values.slice(offset, length),
+ nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
+ }
}
/// Returns a new boolean array builder
@@ -125,7 +123,7 @@ impl BooleanArray {
/// Returns the number of non null, true values within this array
pub fn true_count(&self) -> usize {
- match self.data.nulls() {
+ match self.nulls() {
Some(nulls) => {
let null_chunks = nulls.inner().bit_chunks().iter_padded();
let value_chunks = self.values().bit_chunks().iter_padded();
@@ -247,25 +245,48 @@ impl Array for BooleanArray {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &DataType::Boolean
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
- // TODO: Slice buffers directly (#3880)
- Arc::new(Self::from(self.data.slice(offset, length)))
+ Arc::new(self.slice(offset, length))
+ }
+
+ fn len(&self) -> usize {
+ self.values.len()
+ }
+
+ fn is_empty(&self) -> bool {
+ self.values.is_empty()
+ }
+
+ fn offset(&self) -> usize {
+ self.values.offset()
}
fn nulls(&self) -> Option<&NullBuffer> {
- self.data.nulls()
+ self.nulls.as_ref()
+ }
+
+ fn get_buffer_memory_size(&self) -> usize {
+ let mut sum = self.values.inner().capacity();
+ if let Some(x) = &self.nulls {
+ sum += x.buffer().capacity()
+ }
+ sum
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ std::mem::size_of::<Self>() + self.get_buffer_memory_size()
}
}
@@ -324,13 +345,22 @@ impl From<ArrayData> for BooleanArray {
let values =
BooleanBuffer::new(data.buffers()[0].clone(), data.offset(),
data.len());
- Self { data, values }
+ Self {
+ values,
+ nulls: data.nulls().cloned(),
+ }
}
}
impl From<BooleanArray> for ArrayData {
fn from(array: BooleanArray) -> Self {
- array.data
+ let builder = ArrayDataBuilder::new(DataType::Boolean)
+ .len(array.values.len())
+ .offset(array.values.offset())
+ .nulls(array.nulls)
+ .buffers(vec![array.values.into_inner()]);
+
+ unsafe { builder.build_unchecked() }
}
}
diff --git a/arrow-array/src/array/byte_array.rs
b/arrow-array/src/array/byte_array.rs
index f0e43e694..e23079ef9 100644
--- a/arrow-array/src/array/byte_array.rs
+++ b/arrow-array/src/array/byte_array.rs
@@ -23,7 +23,7 @@ use crate::types::ByteArrayType;
use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait};
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_buffer::{NullBuffer, OffsetBuffer};
-use arrow_data::ArrayData;
+use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
use std::any::Any;
use std::sync::Arc;
@@ -39,17 +39,19 @@ use std::sync::Arc;
/// [`BinaryArray`]: crate::BinaryArray
/// [`LargeBinaryArray`]: crate::LargeBinaryArray
pub struct GenericByteArray<T: ByteArrayType> {
- data: ArrayData,
+ data_type: DataType,
value_offsets: OffsetBuffer<T::Offset>,
value_data: Buffer,
+ nulls: Option<NullBuffer>,
}
impl<T: ByteArrayType> Clone for GenericByteArray<T> {
fn clone(&self) -> Self {
Self {
- data: self.data.clone(),
+ data_type: self.data_type.clone(),
value_offsets: self.value_offsets.clone(),
value_data: self.value_data.clone(),
+ nulls: self.nulls.clone(),
}
}
}
@@ -135,7 +137,7 @@ impl<T: ByteArrayType> GenericByteArray<T> {
/// Panics if index `i` is out of bounds.
pub fn value(&self, i: usize) -> &T::Native {
assert!(
- i < self.data.len(),
+ i < self.len(),
"Trying to access an element at index {} from a {}{}Array of
length {}",
i,
T::Offset::PREFIX,
@@ -154,29 +156,33 @@ impl<T: ByteArrayType> GenericByteArray<T> {
/// Returns a zero-copy slice of this array with the indicated offset and
length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ Self {
+ data_type: self.data_type.clone(),
+ value_offsets: self.value_offsets.slice(offset, length),
+ value_data: self.value_data.clone(),
+ nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
+ }
}
/// Returns `GenericByteBuilder` of this byte array for mutating its
values if the underlying
/// offset and data buffers are not shared by others.
pub fn into_builder(self) -> Result<GenericByteBuilder<T>, Self> {
let len = self.len();
- let null_bit_buffer = self.data.nulls().map(|b| b.inner().sliced());
+ let value_len =
+ T::Offset::as_usize(self.value_offsets()[len] -
self.value_offsets()[0]);
+
+ let data = self.into_data();
+ let null_bit_buffer = data.nulls().map(|b| b.inner().sliced());
let element_len = std::mem::size_of::<T::Offset>();
- let offset_buffer = self.data.buffers()[0]
- .slice_with_length(self.data.offset() * element_len, (len + 1) *
element_len);
+ let offset_buffer = data.buffers()[0]
+ .slice_with_length(data.offset() * element_len, (len + 1) *
element_len);
let element_len = std::mem::size_of::<u8>();
- let value_len =
- T::Offset::as_usize(self.value_offsets()[len] -
self.value_offsets()[0]);
- let value_buffer = self.data.buffers()[1]
- .slice_with_length(self.data.offset() * element_len, value_len *
element_len);
+ let value_buffer = data.buffers()[1]
+ .slice_with_length(data.offset() * element_len, value_len *
element_len);
- drop(self.data);
- drop(self.value_data);
- drop(self.value_offsets);
+ drop(data);
let try_mutable_null_buffer = match null_bit_buffer {
None => Ok(None),
@@ -258,24 +264,49 @@ impl<T: ByteArrayType> Array for GenericByteArray<T> {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.value_offsets.len() - 1
+ }
+
+ fn is_empty(&self) -> bool {
+ self.value_offsets.len() <= 1
+ }
+
+ fn offset(&self) -> usize {
+ 0
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
- self.data.nulls()
+ self.nulls.as_ref()
+ }
+
+ fn get_buffer_memory_size(&self) -> usize {
+ let mut sum = self.value_offsets.inner().inner().capacity();
+ sum += self.value_data.capacity();
+ if let Some(x) = &self.nulls {
+ sum += x.buffer().capacity()
+ }
+ sum
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ std::mem::size_of::<Self>() + self.get_buffer_memory_size()
}
}
@@ -313,18 +344,25 @@ impl<T: ByteArrayType> From<ArrayData> for
GenericByteArray<T> {
let value_offsets = unsafe { get_offsets(&data) };
let value_data = data.buffers()[1].clone();
Self {
- data,
- // SAFETY:
- // ArrayData must be valid, and validated data type above
value_offsets,
value_data,
+ data_type: data.data_type().clone(),
+ nulls: data.nulls().cloned(),
}
}
}
impl<T: ByteArrayType> From<GenericByteArray<T>> for ArrayData {
fn from(array: GenericByteArray<T>) -> Self {
- array.data
+ let len = array.len();
+
+ let offsets = array.value_offsets.into_inner().into_inner();
+ let builder = ArrayDataBuilder::new(array.data_type)
+ .len(len)
+ .buffers(vec![offsets, array.value_data])
+ .nulls(array.nulls);
+
+ unsafe { builder.build_unchecked() }
}
}
diff --git a/arrow-array/src/array/dictionary_array.rs
b/arrow-array/src/array/dictionary_array.rs
index dd6213d54..f25a077a8 100644
--- a/arrow-array/src/array/dictionary_array.rs
+++ b/arrow-array/src/array/dictionary_array.rs
@@ -208,9 +208,7 @@ pub type UInt64DictionaryArray =
DictionaryArray<UInt64Type>;
/// assert_eq!(&array, &expected);
/// ```
pub struct DictionaryArray<K: ArrowDictionaryKeyType> {
- /// Data of this dictionary. Note that this is _not_ compatible with the C
Data interface,
- /// as, in the current implementation, `values` below are the first child
of this struct.
- data: ArrayData,
+ data_type: DataType,
/// The keys of this dictionary. These are constructed from the
/// buffer and null bitmap of `data`. Also, note that these do
@@ -228,7 +226,7 @@ pub struct DictionaryArray<K: ArrowDictionaryKeyType> {
impl<K: ArrowDictionaryKeyType> Clone for DictionaryArray<K> {
fn clone(&self) -> Self {
Self {
- data: self.data.clone(),
+ data_type: self.data_type.clone(),
keys: self.keys.clone(),
values: self.values.clone(),
is_ordered: self.is_ordered,
@@ -325,8 +323,12 @@ impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
/// Returns a zero-copy slice of this array with the indicated offset and
length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ Self {
+ data_type: self.data_type.clone(),
+ keys: self.keys.slice(offset, length),
+ values: self.values.clone(),
+ is_ordered: self.is_ordered,
+ }
}
/// Downcast this dictionary to a [`TypedDictionaryArray`]
@@ -390,8 +392,7 @@ impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
assert!(values.len() >= self.values.len());
let builder = self
- .data
- .clone()
+ .to_data()
.into_builder()
.data_type(DataType::Dictionary(
Box::new(K::DATA_TYPE),
@@ -419,7 +420,6 @@ impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
let key_array = self.keys().clone();
let value_array = self.values().as_primitive::<V>().clone();
- drop(self.data);
drop(self.keys);
drop(self.values);
@@ -504,20 +504,22 @@ impl<T: ArrowDictionaryKeyType> From<ArrayData> for
DictionaryArray<T> {
key_data_type
);
+ let values = make_array(data.child_data()[0].clone());
+ let data_type = data.data_type().clone();
+
// create a zero-copy of the keys' data
// SAFETY:
// ArrayData is valid and verified type above
let keys = PrimitiveArray::<T>::from(unsafe {
- data.clone()
- .into_builder()
+ data.into_builder()
.data_type(T::DATA_TYPE)
.child_data(vec![])
.build_unchecked()
});
- let values = make_array(data.child_data()[0].clone());
+
Self {
- data,
+ data_type,
keys,
values,
is_ordered: false,
@@ -530,7 +532,14 @@ impl<T: ArrowDictionaryKeyType> From<ArrayData> for
DictionaryArray<T> {
impl<T: ArrowDictionaryKeyType> From<DictionaryArray<T>> for ArrayData {
fn from(array: DictionaryArray<T>) -> Self {
- array.data
+ let builder = array
+ .keys
+ .into_data()
+ .into_builder()
+ .data_type(array.data_type)
+ .child_data(vec![array.values.to_data()]);
+
+ unsafe { builder.build_unchecked() }
}
}
@@ -594,24 +603,46 @@ impl<T: ArrowDictionaryKeyType> Array for
DictionaryArray<T> {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.keys.len()
+ }
+
+ fn is_empty(&self) -> bool {
+ self.keys.is_empty()
+ }
+
+ fn offset(&self) -> usize {
+ self.keys.offset()
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
- self.data.nulls()
+ self.keys.nulls()
+ }
+
+ fn get_buffer_memory_size(&self) -> usize {
+ self.keys.get_buffer_memory_size() +
self.values.get_buffer_memory_size()
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ std::mem::size_of::<Self>()
+ + self.keys.get_buffer_memory_size()
+ + self.values.get_array_memory_size()
}
}
@@ -685,10 +716,6 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for
TypedDictionaryArray<'a,
self.dictionary
}
- fn data(&self) -> &ArrayData {
- &self.dictionary.data
- }
-
fn to_data(&self) -> ArrayData {
self.dictionary.to_data()
}
@@ -697,13 +724,37 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for
TypedDictionaryArray<'a,
self.dictionary.into_data()
}
+ fn data_type(&self) -> &DataType {
+ self.dictionary.data_type()
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.dictionary.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.dictionary.len()
+ }
+
+ fn is_empty(&self) -> bool {
+ self.dictionary.is_empty()
+ }
+
+ fn offset(&self) -> usize {
+ self.dictionary.offset()
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
self.dictionary.nulls()
}
+
+ fn get_buffer_memory_size(&self) -> usize {
+ self.dictionary.get_buffer_memory_size()
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ self.dictionary.get_array_memory_size()
+ }
}
impl<'a, K, V> IntoIterator for TypedDictionaryArray<'a, K, V>
diff --git a/arrow-array/src/array/fixed_size_binary_array.rs
b/arrow-array/src/array/fixed_size_binary_array.rs
index f8d2f04de..08ce76c06 100644
--- a/arrow-array/src/array/fixed_size_binary_array.rs
+++ b/arrow-array/src/array/fixed_size_binary_array.rs
@@ -20,7 +20,7 @@ use crate::iterator::FixedSizeBinaryIter;
use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray};
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::{bit_util, Buffer, MutableBuffer};
-use arrow_data::ArrayData;
+use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;
@@ -51,9 +51,11 @@ use std::sync::Arc;
///
#[derive(Clone)]
pub struct FixedSizeBinaryArray {
- data: ArrayData,
+ data_type: DataType, // Must be DataType::FixedSizeBinary(value_length)
value_data: Buffer,
- length: i32,
+ nulls: Option<NullBuffer>,
+ len: usize,
+ value_length: i32,
}
impl FixedSizeBinaryArray {
@@ -62,12 +64,12 @@ impl FixedSizeBinaryArray {
/// Panics if index `i` is out of bounds.
pub fn value(&self, i: usize) -> &[u8] {
assert!(
- i < self.data.len(),
+ i < self.len(),
"Trying to access an element at index {} from a
FixedSizeBinaryArray of length {}",
i,
self.len()
);
- let offset = i + self.data.offset();
+ let offset = i + self.offset();
unsafe {
let pos = self.value_offset_at(offset);
std::slice::from_raw_parts(
@@ -81,7 +83,7 @@ impl FixedSizeBinaryArray {
/// # Safety
/// Caller is responsible for ensuring that the index is within the bounds
of the array
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
- let offset = i + self.data.offset();
+ let offset = i + self.offset();
let pos = self.value_offset_at(offset);
std::slice::from_raw_parts(
self.value_data.as_ptr().offset(pos as isize),
@@ -94,7 +96,7 @@ impl FixedSizeBinaryArray {
/// Note this doesn't do any bound checking, for performance reason.
#[inline]
pub fn value_offset(&self, i: usize) -> i32 {
- self.value_offset_at(self.data.offset() + i)
+ self.value_offset_at(self.offset() + i)
}
/// Returns the length for an element.
@@ -102,18 +104,30 @@ impl FixedSizeBinaryArray {
/// All elements have the same length as the array is a fixed size.
#[inline]
pub fn value_length(&self) -> i32 {
- self.length
+ self.value_length
}
/// Returns a clone of the value data buffer
pub fn value_data(&self) -> Buffer {
- self.data.buffers()[0].clone()
+ self.value_data.clone()
}
/// Returns a zero-copy slice of this array with the indicated offset and
length.
- pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ pub fn slice(&self, offset: usize, len: usize) -> Self {
+ assert!(
+ offset.saturating_add(len) <= self.len,
+ "the length + offset of the sliced FixedSizeBinaryArray cannot
exceed the existing length"
+ );
+
+ let size = self.value_length as usize;
+
+ Self {
+ data_type: self.data_type.clone(),
+ nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)),
+ value_length: self.value_length,
+ value_data: self.value_data.slice_with_length(offset * size, len *
size),
+ len,
+ }
}
/// Create an array from an iterable argument of sparse byte slices.
@@ -364,7 +378,7 @@ impl FixedSizeBinaryArray {
#[inline]
fn value_offset_at(&self, i: usize) -> i32 {
- self.length * i as i32
+ self.value_length * i as i32
}
/// constructs a new iterator
@@ -380,22 +394,33 @@ impl From<ArrayData> for FixedSizeBinaryArray {
1,
"FixedSizeBinaryArray data should contain 1 buffer only (values)"
);
- let value_data = data.buffers()[0].clone();
- let length = match data.data_type() {
+ let value_length = match data.data_type() {
DataType::FixedSizeBinary(len) => *len,
_ => panic!("Expected data type to be FixedSizeBinary"),
};
+
+ let size = value_length as usize;
+ let value_data =
+ data.buffers()[0].slice_with_length(data.offset() * size,
data.len() * size);
+
Self {
- data,
+ data_type: data.data_type().clone(),
+ nulls: data.nulls().cloned(),
+ len: data.len(),
value_data,
- length,
+ value_length,
}
}
}
impl From<FixedSizeBinaryArray> for ArrayData {
fn from(array: FixedSizeBinaryArray) -> Self {
- array.data
+ let builder = ArrayDataBuilder::new(array.data_type)
+ .len(array.len)
+ .buffers(vec![array.value_data])
+ .nulls(array.nulls);
+
+ unsafe { builder.build_unchecked() }
}
}
@@ -468,24 +493,48 @@ impl Array for FixedSizeBinaryArray {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.len
+ }
+
+ fn is_empty(&self) -> bool {
+ self.len == 0
+ }
+
+ fn offset(&self) -> usize {
+ 0
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
- self.data.nulls()
+ self.nulls.as_ref()
+ }
+
+ fn get_buffer_memory_size(&self) -> usize {
+ let mut sum = self.value_data.capacity();
+ if let Some(n) = &self.nulls {
+ sum += n.buffer().capacity();
+ }
+ sum
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ std::mem::size_of::<Self>() + self.get_buffer_memory_size()
}
}
@@ -566,9 +615,9 @@ mod tests {
fixed_size_binary_array.value(1)
);
assert_eq!(2, fixed_size_binary_array.len());
- assert_eq!(5, fixed_size_binary_array.value_offset(0));
+ assert_eq!(0, fixed_size_binary_array.value_offset(0));
assert_eq!(5, fixed_size_binary_array.value_length());
- assert_eq!(10, fixed_size_binary_array.value_offset(1));
+ assert_eq!(5, fixed_size_binary_array.value_offset(1));
}
#[test]
diff --git a/arrow-array/src/array/fixed_size_list_array.rs
b/arrow-array/src/array/fixed_size_list_array.rs
index a56bb017f..86adafa06 100644
--- a/arrow-array/src/array/fixed_size_list_array.rs
+++ b/arrow-array/src/array/fixed_size_list_array.rs
@@ -19,7 +19,7 @@ use crate::array::print_long_array;
use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder};
use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType};
use arrow_buffer::buffer::NullBuffer;
-use arrow_data::ArrayData;
+use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
use std::any::Any;
use std::sync::Arc;
@@ -64,9 +64,11 @@ use std::sync::Arc;
/// [crate::array::FixedSizeBinaryArray]
#[derive(Clone)]
pub struct FixedSizeListArray {
- data: ArrayData,
+ data_type: DataType, // Must be DataType::FixedSizeList(value_length)
values: ArrayRef,
- length: i32,
+ nulls: Option<NullBuffer>,
+ value_length: i32,
+ len: usize,
}
impl FixedSizeListArray {
@@ -91,7 +93,7 @@ impl FixedSizeListArray {
/// Note this doesn't do any bound checking, for performance reason.
#[inline]
pub fn value_offset(&self, i: usize) -> i32 {
- self.value_offset_at(self.data.offset() + i)
+ self.value_offset_at(i)
}
/// Returns the length for an element.
@@ -99,18 +101,29 @@ impl FixedSizeListArray {
/// All elements have the same length as the array is a fixed size.
#[inline]
pub const fn value_length(&self) -> i32 {
- self.length
+ self.value_length
}
#[inline]
const fn value_offset_at(&self, i: usize) -> i32 {
- i as i32 * self.length
+ i as i32 * self.value_length
}
/// Returns a zero-copy slice of this array with the indicated offset and
length.
- pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ pub fn slice(&self, offset: usize, len: usize) -> Self {
+ assert!(
+ offset.saturating_add(len) <= self.len,
+ "the length + offset of the sliced FixedSizeListArray cannot
exceed the existing length"
+ );
+ let size = self.value_length as usize;
+
+ Self {
+ data_type: self.data_type.clone(),
+ values: self.values.slice(offset * size, len * size),
+ nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)),
+ value_length: self.value_length,
+ len,
+ }
}
/// Creates a [`FixedSizeListArray`] from an iterator of primitive values
@@ -163,45 +176,35 @@ impl FixedSizeListArray {
impl From<ArrayData> for FixedSizeListArray {
fn from(data: ArrayData) -> Self {
- assert_eq!(
- data.buffers().len(),
- 0,
- "FixedSizeListArray data should not contain a buffer for value
offsets"
- );
- assert_eq!(
- data.child_data().len(),
- 1,
- "FixedSizeListArray should contain a single child array (values
array)"
- );
- let values = make_array(data.child_data()[0].clone());
- let length = match data.data_type() {
- DataType::FixedSizeList(_, len) => {
- if *len > 0 {
- // check that child data is multiple of length
- assert_eq!(
- values.len() % *len as usize,
- 0,
- "FixedSizeListArray child array length should be a
multiple of {len}"
- );
- }
-
- *len
- }
+ let value_length = match data.data_type() {
+ DataType::FixedSizeList(_, len) => *len,
_ => {
panic!("FixedSizeListArray data should contain a FixedSizeList
data type")
}
};
+
+ let size = value_length as usize;
+ let values = make_array(
+ data.child_data()[0].slice(data.offset() * size, data.len() *
size),
+ );
Self {
- data,
+ data_type: data.data_type().clone(),
values,
- length,
+ nulls: data.nulls().cloned(),
+ value_length,
+ len: data.len(),
}
}
}
impl From<FixedSizeListArray> for ArrayData {
fn from(array: FixedSizeListArray) -> Self {
- array.data
+ let builder = ArrayDataBuilder::new(array.data_type)
+ .len(array.len)
+ .nulls(array.nulls)
+ .child_data(vec![array.values.to_data()]);
+
+ unsafe { builder.build_unchecked() }
}
}
@@ -210,24 +213,52 @@ impl Array for FixedSizeListArray {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.len
+ }
+
+ fn is_empty(&self) -> bool {
+ self.len == 0
+ }
+
+ fn offset(&self) -> usize {
+ 0
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
- self.data.nulls()
+ self.nulls.as_ref()
+ }
+
+ fn get_buffer_memory_size(&self) -> usize {
+ let mut size = self.values.get_buffer_memory_size();
+ if let Some(n) = self.nulls.as_ref() {
+ size += n.buffer().capacity();
+ }
+ size
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ let mut size = std::mem::size_of::<Self>() +
self.values.get_array_memory_size();
+ if let Some(n) = self.nulls.as_ref() {
+ size += n.buffer().capacity();
+ }
+ size
}
}
@@ -258,7 +289,6 @@ mod tests {
use super::*;
use crate::cast::AsArray;
use crate::types::Int32Type;
- use crate::Int32Array;
use arrow_buffer::{bit_util, Buffer};
use arrow_schema::Field;
@@ -289,15 +319,7 @@ mod tests {
assert_eq!(0, list_array.null_count());
assert_eq!(6, list_array.value_offset(2));
assert_eq!(3, list_array.value_length());
- assert_eq!(
- 0,
- list_array
- .value(0)
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .value(0)
- );
+ assert_eq!(0,
list_array.value(0).as_primitive::<Int32Type>().value(0));
for i in 0..3 {
assert!(list_array.is_valid(i));
assert!(!list_array.is_null(i));
@@ -305,26 +327,24 @@ mod tests {
// Now test with a non-zero offset
let list_data = ArrayData::builder(list_data_type)
- .len(3)
+ .len(2)
.offset(1)
.add_child_data(value_data.clone())
.build()
.unwrap();
let list_array = FixedSizeListArray::from(list_data);
- assert_eq!(value_data, list_array.values().to_data());
+ assert_eq!(value_data.slice(3, 6), list_array.values().to_data());
assert_eq!(DataType::Int32, list_array.value_type());
- assert_eq!(3, list_array.len());
+ assert_eq!(2, list_array.len());
assert_eq!(0, list_array.null_count());
assert_eq!(3,
list_array.value(0).as_primitive::<Int32Type>().value(0));
- assert_eq!(6, list_array.value_offset(1));
+ assert_eq!(3, list_array.value_offset(1));
assert_eq!(3, list_array.value_length());
}
#[test]
- #[should_panic(
- expected = "FixedSizeListArray child array length should be a multiple
of 3"
- )]
+ #[should_panic(expected = "assertion failed: (offset + length) <=
self.len()")]
// Different error messages, so skip for now
// https://github.com/apache/arrow-rs/issues/1545
#[cfg(not(feature = "force_validate"))]
@@ -389,11 +409,10 @@ mod tests {
let sliced_array = list_array.slice(1, 4);
assert_eq!(4, sliced_array.len());
- assert_eq!(1, sliced_array.offset());
assert_eq!(2, sliced_array.null_count());
for i in 0..sliced_array.len() {
- if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+ if bit_util::get_bit(&null_bits, 1 + i) {
assert!(sliced_array.is_valid(i));
} else {
assert!(sliced_array.is_null(i));
@@ -406,12 +425,14 @@ mod tests {
.downcast_ref::<FixedSizeListArray>()
.unwrap();
assert_eq!(2, sliced_list_array.value_length());
- assert_eq!(6, sliced_list_array.value_offset(2));
- assert_eq!(8, sliced_list_array.value_offset(3));
+ assert_eq!(4, sliced_list_array.value_offset(2));
+ assert_eq!(6, sliced_list_array.value_offset(3));
}
#[test]
- #[should_panic(expected = "assertion failed: (offset + length) <=
self.len()")]
+ #[should_panic(
+ expected = "the offset of the new Buffer cannot exceed the existing
length"
+ )]
fn test_fixed_size_list_array_index_out_of_bound() {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int32)
diff --git a/arrow-array/src/array/list_array.rs
b/arrow-array/src/array/list_array.rs
index f47ea8069..8e6f84743 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -21,7 +21,7 @@ use crate::{
iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef,
ArrowPrimitiveType,
};
use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer};
-use arrow_data::ArrayData;
+use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, Field};
use num::Integer;
use std::any::Any;
@@ -52,7 +52,8 @@ impl OffsetSizeTrait for i64 {
///
/// For non generic lists, you may wish to consider using [`ListArray`] or
[`LargeListArray`]`
pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
- data: ArrayData,
+ data_type: DataType,
+ nulls: Option<NullBuffer>,
values: ArrayRef,
value_offsets: OffsetBuffer<OffsetSize>,
}
@@ -60,7 +61,8 @@ pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
impl<OffsetSize: OffsetSizeTrait> Clone for GenericListArray<OffsetSize> {
fn clone(&self) -> Self {
Self {
- data: self.data.clone(),
+ data_type: self.data_type.clone(),
+ nulls: self.nulls.clone(),
values: self.values.clone(),
value_offsets: self.value_offsets.clone(),
}
@@ -144,8 +146,12 @@ impl<OffsetSize: OffsetSizeTrait>
GenericListArray<OffsetSize> {
/// Returns a zero-copy slice of this array with the indicated offset and
length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ Self {
+ data_type: self.data_type.clone(),
+ nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
+ values: self.values.clone(),
+ value_offsets: self.value_offsets.slice(offset, length),
+ }
}
/// Creates a [`GenericListArray`] from an iterator of primitive values
@@ -201,7 +207,14 @@ impl<OffsetSize: 'static + OffsetSizeTrait>
From<GenericListArray<OffsetSize>>
for ArrayData
{
fn from(array: GenericListArray<OffsetSize>) -> Self {
- array.data
+ let len = array.len();
+ let builder = ArrayDataBuilder::new(array.data_type)
+ .len(len)
+ .nulls(array.nulls)
+ .buffers(vec![array.value_offsets.into_inner().into_inner()])
+ .child_data(vec![array.values.to_data()]);
+
+ unsafe { builder.build_unchecked() }
}
}
@@ -244,7 +257,8 @@ impl<OffsetSize: OffsetSizeTrait>
GenericListArray<OffsetSize> {
let value_offsets = unsafe { get_offsets(&data) };
Ok(Self {
- data,
+ data_type: data.data_type().clone(),
+ nulls: data.nulls().cloned(),
values,
value_offsets,
})
@@ -256,24 +270,54 @@ impl<OffsetSize: OffsetSizeTrait> Array for
GenericListArray<OffsetSize> {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.value_offsets.len() - 1
+ }
+
+ fn is_empty(&self) -> bool {
+ self.value_offsets.len() <= 1
+ }
+
+ fn offset(&self) -> usize {
+ 0
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
- self.data.nulls()
+ self.nulls.as_ref()
+ }
+
+ fn get_buffer_memory_size(&self) -> usize {
+ let mut size = self.values.get_buffer_memory_size();
+ size += self.value_offsets.inner().inner().capacity();
+ if let Some(n) = self.nulls.as_ref() {
+ size += n.buffer().capacity();
+ }
+ size
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ let mut size = std::mem::size_of::<Self>() +
self.values.get_array_memory_size();
+ size += self.value_offsets.inner().inner().capacity();
+ if let Some(n) = self.nulls.as_ref() {
+ size += n.buffer().capacity();
+ }
+ size
}
}
@@ -649,11 +693,10 @@ mod tests {
let sliced_array = list_array.slice(1, 6);
assert_eq!(6, sliced_array.len());
- assert_eq!(1, sliced_array.offset());
assert_eq!(3, sliced_array.null_count());
for i in 0..sliced_array.len() {
- if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+ if bit_util::get_bit(&null_bits, 1 + i) {
assert!(sliced_array.is_valid(i));
} else {
assert!(sliced_array.is_null(i));
@@ -713,11 +756,10 @@ mod tests {
let sliced_array = list_array.slice(1, 6);
assert_eq!(6, sliced_array.len());
- assert_eq!(1, sliced_array.offset());
assert_eq!(3, sliced_array.null_count());
for i in 0..sliced_array.len() {
- if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+ if bit_util::get_bit(&null_bits, 1 + i) {
assert!(sliced_array.is_valid(i));
} else {
assert!(sliced_array.is_null(i));
diff --git a/arrow-array/src/array/map_array.rs
b/arrow-array/src/array/map_array.rs
index 1629532b8..18b3eb3ce 100644
--- a/arrow-array/src/array/map_array.rs
+++ b/arrow-array/src/array/map_array.rs
@@ -18,7 +18,7 @@
use crate::array::{get_offsets, print_long_array};
use crate::{make_array, Array, ArrayRef, ListArray, StringArray, StructArray};
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer,
ToByteSlice};
-use arrow_data::ArrayData;
+use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, Field};
use std::any::Any;
use std::sync::Arc;
@@ -30,7 +30,8 @@ use std::sync::Arc;
/// [StructArray] with 2 child fields.
#[derive(Clone)]
pub struct MapArray {
- data: ArrayData,
+ data_type: DataType,
+ nulls: Option<NullBuffer>,
/// The [`StructArray`] that is the direct child of this array
entries: ArrayRef,
/// The first child of `entries`, the "keys" of this MapArray
@@ -112,8 +113,14 @@ impl MapArray {
/// Returns a zero-copy slice of this array with the indicated offset and
length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ Self {
+ data_type: self.data_type.clone(),
+ nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
+ entries: self.entries.clone(),
+ keys: self.keys.clone(),
+ values: self.values.clone(),
+ value_offsets: self.value_offsets.slice(offset, length),
+ }
}
}
@@ -126,7 +133,14 @@ impl From<ArrayData> for MapArray {
impl From<MapArray> for ArrayData {
fn from(array: MapArray) -> Self {
- array.data
+ let len = array.len();
+ let builder = ArrayDataBuilder::new(array.data_type)
+ .len(len)
+ .nulls(array.nulls)
+ .buffers(vec![array.value_offsets.into_inner().into_inner()])
+ .child_data(vec![array.entries.to_data()]);
+
+ unsafe { builder.build_unchecked() }
}
}
@@ -177,7 +191,8 @@ impl MapArray {
let value_offsets = unsafe { get_offsets(&data) };
Ok(Self {
- data,
+ data_type: data.data_type().clone(),
+ nulls: data.nulls().cloned(),
entries,
keys,
values,
@@ -229,34 +244,54 @@ impl Array for MapArray {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into_data()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.value_offsets.len() - 1
+ }
+
+ fn is_empty(&self) -> bool {
+ self.value_offsets.len() <= 1
+ }
+
+ fn offset(&self) -> usize {
+ 0
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
- self.data.nulls()
+ self.nulls.as_ref()
}
- /// Returns the total number of bytes of memory occupied by the buffers
owned by this [MapArray].
fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size()
+ let mut size = self.entries.get_buffer_memory_size();
+ size += self.value_offsets.inner().inner().capacity();
+ if let Some(n) = self.nulls.as_ref() {
+ size += n.buffer().capacity();
+ }
+ size
}
- /// Returns the total number of bytes of memory occupied physically by
this [MapArray].
fn get_array_memory_size(&self) -> usize {
- self.data.get_array_memory_size() + std::mem::size_of_val(self)
+ let mut size = std::mem::size_of::<Self>() +
self.entries.get_array_memory_size();
+ size += self.value_offsets.inner().inner().capacity();
+ if let Some(n) = self.nulls.as_ref() {
+ size += n.buffer().capacity();
+ }
+ size
}
}
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index fa6e970b4..e6fd6828b 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -94,10 +94,6 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// ```
fn as_any(&self) -> &dyn Any;
- /// Returns a reference to the underlying data of this array
- #[deprecated(note = "Use Array::to_data or Array::into_data")]
- fn data(&self) -> &ArrayData;
-
/// Returns the underlying data of this array
fn to_data(&self) -> ArrayData;
@@ -106,13 +102,6 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// Unlike [`Array::to_data`] this consumes self, allowing it avoid
unnecessary clones
fn into_data(self) -> ArrayData;
- /// Returns a reference-counted pointer to the underlying data of this
array.
- #[deprecated(note = "Use Array::to_data or Array::into_data")]
- #[allow(deprecated)]
- fn data_ref(&self) -> &ArrayData {
- self.data()
- }
-
/// Returns a reference to the [`DataType`](arrow_schema::DataType) of
this array.
///
/// # Example:
@@ -125,10 +114,7 @@ pub trait Array: std::fmt::Debug + Send + Sync {
///
/// assert_eq!(*array.data_type(), DataType::Int32);
/// ```
- #[allow(deprecated)] // (#3880)
- fn data_type(&self) -> &DataType {
- self.data_ref().data_type()
- }
+ fn data_type(&self) -> &DataType;
/// Returns a zero-copy slice of this array with the indicated offset and
length.
///
@@ -156,10 +142,7 @@ pub trait Array: std::fmt::Debug + Send + Sync {
///
/// assert_eq!(array.len(), 5);
/// ```
- #[allow(deprecated)] // (#3880)
- fn len(&self) -> usize {
- self.data_ref().len()
- }
+ fn len(&self) -> usize;
/// Returns whether this array is empty.
///
@@ -172,10 +155,7 @@ pub trait Array: std::fmt::Debug + Send + Sync {
///
/// assert_eq!(array.is_empty(), false);
/// ```
- #[allow(deprecated)] // (#3880)
- fn is_empty(&self) -> bool {
- self.data_ref().is_empty()
- }
+ fn is_empty(&self) -> bool;
/// Returns the offset into the underlying data used by this array(-slice).
/// Note that the underlying data can be shared by many arrays.
@@ -184,19 +164,15 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// # Example:
///
/// ```
- /// use arrow_array::{Array, Int32Array};
+ /// use arrow_array::{Array, BooleanArray};
///
- /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
- /// // Make slice over the values [2, 3, 4]
+ /// let array = BooleanArray::from(vec![false, false, true, true]);
/// let array_slice = array.slice(1, 3);
///
/// assert_eq!(array.offset(), 0);
/// assert_eq!(array_slice.offset(), 1);
/// ```
- #[allow(deprecated)] // (#3880)
- fn offset(&self) -> usize {
- self.data_ref().offset()
- }
+ fn offset(&self) -> usize;
/// Returns the null buffers of this array if any
fn nulls(&self) -> Option<&NullBuffer>;
@@ -253,21 +229,12 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// Returns the total number of bytes of memory pointed to by this array.
/// The buffers store bytes in the Arrow memory format, and include the
data as well as the validity map.
- #[allow(deprecated)] // (#3880)
- fn get_buffer_memory_size(&self) -> usize {
- self.data_ref().get_buffer_memory_size()
- }
+ fn get_buffer_memory_size(&self) -> usize;
/// Returns the total number of bytes of memory occupied physically by
this array.
/// This value will always be greater than returned by
`get_buffer_memory_size()` and
/// includes the overhead of the data structures that contain the pointers
to the various buffers.
- #[allow(deprecated)] // (#3880)
- fn get_array_memory_size(&self) -> usize {
- // both data.get_array_memory_size and size_of_val(self) include
ArrayData fields,
- // to only count additional fields of this array subtract
size_of(ArrayData)
- self.data_ref().get_array_memory_size() + std::mem::size_of_val(self)
- - std::mem::size_of::<ArrayData>()
- }
+ fn get_array_memory_size(&self) -> usize;
}
/// A reference-counted reference to a generic `Array`.
@@ -279,11 +246,6 @@ impl Array for ArrayRef {
self.as_ref().as_any()
}
- #[allow(deprecated)]
- fn data(&self) -> &ArrayData {
- self.as_ref().data()
- }
-
fn to_data(&self) -> ArrayData {
self.as_ref().to_data()
}
@@ -292,11 +254,6 @@ impl Array for ArrayRef {
self.to_data()
}
- #[allow(deprecated)]
- fn data_ref(&self) -> &ArrayData {
- self.as_ref().data_ref()
- }
-
fn data_type(&self) -> &DataType {
self.as_ref().data_type()
}
@@ -347,11 +304,6 @@ impl<'a, T: Array> Array for &'a T {
T::as_any(self)
}
- #[allow(deprecated)]
- fn data(&self) -> &ArrayData {
- T::data(self)
- }
-
fn to_data(&self) -> ArrayData {
T::to_data(self)
}
@@ -360,11 +312,6 @@ impl<'a, T: Array> Array for &'a T {
self.to_data()
}
- #[allow(deprecated)]
- fn data_ref(&self) -> &ArrayData {
- T::data_ref(self)
- }
-
fn data_type(&self) -> &DataType {
T::data_type(self)
}
@@ -435,93 +382,80 @@ pub trait ArrayAccessor: Array {
}
impl PartialEq for dyn Array + '_ {
- #[allow(deprecated)]
fn eq(&self, other: &Self) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl<T: Array> PartialEq<T> for dyn Array + '_ {
- #[allow(deprecated)]
fn eq(&self, other: &T) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl PartialEq for NullArray {
- #[allow(deprecated)]
fn eq(&self, other: &NullArray) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl<T: ArrowPrimitiveType> PartialEq for PrimitiveArray<T> {
- #[allow(deprecated)]
fn eq(&self, other: &PrimitiveArray<T>) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl<K: ArrowDictionaryKeyType> PartialEq for DictionaryArray<K> {
- #[allow(deprecated)]
fn eq(&self, other: &Self) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl PartialEq for BooleanArray {
- #[allow(deprecated)]
fn eq(&self, other: &BooleanArray) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl<OffsetSize: OffsetSizeTrait> PartialEq for GenericStringArray<OffsetSize>
{
- #[allow(deprecated)]
fn eq(&self, other: &Self) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl<OffsetSize: OffsetSizeTrait> PartialEq for GenericBinaryArray<OffsetSize>
{
- #[allow(deprecated)]
fn eq(&self, other: &Self) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl PartialEq for FixedSizeBinaryArray {
- #[allow(deprecated)]
fn eq(&self, other: &Self) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl<OffsetSize: OffsetSizeTrait> PartialEq for GenericListArray<OffsetSize> {
- #[allow(deprecated)]
fn eq(&self, other: &Self) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl PartialEq for MapArray {
- #[allow(deprecated)]
fn eq(&self, other: &Self) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl PartialEq for FixedSizeListArray {
- #[allow(deprecated)]
fn eq(&self, other: &Self) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
impl PartialEq for StructArray {
- #[allow(deprecated)]
fn eq(&self, other: &Self) -> bool {
- self.data().eq(other.data())
+ self.to_data().eq(&other.to_data())
}
}
@@ -752,7 +686,7 @@ mod tests {
use super::*;
use crate::cast::{as_union_array, downcast_array};
use crate::downcast_run_array;
- use arrow_buffer::{Buffer, MutableBuffer};
+ use arrow_buffer::MutableBuffer;
use arrow_schema::{Field, Fields, UnionFields, UnionMode};
#[test]
@@ -962,13 +896,9 @@ mod tests {
assert_eq!(0, null_arr.get_buffer_memory_size());
assert_eq!(
- std::mem::size_of::<NullArray>(),
+ std::mem::size_of::<usize>(),
null_arr.get_array_memory_size()
);
- assert_eq!(
- std::mem::size_of::<NullArray>(),
- std::mem::size_of::<ArrayData>(),
- );
}
#[test]
@@ -1001,8 +931,7 @@ mod tests {
// which includes the optional validity buffer
// plus one buffer on the heap
assert_eq!(
- std::mem::size_of::<PrimitiveArray<Int64Type>>()
- + std::mem::size_of::<Buffer>(),
+ std::mem::size_of::<PrimitiveArray<Int64Type>>(),
empty_with_bitmap.get_array_memory_size()
);
diff --git a/arrow-array/src/array/null_array.rs
b/arrow-array/src/array/null_array.rs
index b5d9247a6..c7f61d91d 100644
--- a/arrow-array/src/array/null_array.rs
+++ b/arrow-array/src/array/null_array.rs
@@ -19,7 +19,7 @@
use crate::{Array, ArrayRef};
use arrow_buffer::buffer::NullBuffer;
-use arrow_data::ArrayData;
+use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
use std::any::Any;
use std::sync::Arc;
@@ -40,7 +40,7 @@ use std::sync::Arc;
/// ```
#[derive(Clone)]
pub struct NullArray {
- data: ArrayData,
+ len: usize,
}
impl NullArray {
@@ -50,15 +50,17 @@ impl NullArray {
/// other [`DataType`].
///
pub fn new(length: usize) -> Self {
- let array_data = ArrayData::builder(DataType::Null).len(length);
- let array_data = unsafe { array_data.build_unchecked() };
- NullArray::from(array_data)
+ Self { len: length }
}
/// Returns a zero-copy slice of this array with the indicated offset and
length.
- pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ pub fn slice(&self, offset: usize, len: usize) -> Self {
+ assert!(
+ offset.saturating_add(len) <= self.len,
+ "the length + offset of the sliced BooleanBuffer cannot exceed the
existing length"
+ );
+
+ Self { len }
}
}
@@ -67,22 +69,34 @@ impl Array for NullArray {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &DataType::Null
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.len
+ }
+
+ fn is_empty(&self) -> bool {
+ self.len == 0
+ }
+
+ fn offset(&self) -> usize {
+ 0
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
None
}
@@ -104,6 +118,14 @@ impl Array for NullArray {
fn null_count(&self) -> usize {
self.len()
}
+
+ fn get_buffer_memory_size(&self) -> usize {
+ 0
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ std::mem::size_of::<Self>()
+ }
}
impl From<ArrayData> for NullArray {
@@ -122,13 +144,14 @@ impl From<ArrayData> for NullArray {
data.nulls().is_none(),
"NullArray data should not contain a null buffer, as no buffers
are required"
);
- Self { data }
+ Self { len: data.len() }
}
}
impl From<NullArray> for ArrayData {
fn from(array: NullArray) -> Self {
- array.data
+ let builder = ArrayDataBuilder::new(DataType::Null).len(array.len);
+ unsafe { builder.build_unchecked() }
}
}
@@ -158,7 +181,6 @@ mod tests {
let array2 = array1.slice(8, 16);
assert_eq!(array2.len(), 16);
assert_eq!(array2.null_count(), 16);
- assert_eq!(array2.offset(), 8);
}
#[test]
diff --git a/arrow-array/src/array/primitive_array.rs
b/arrow-array/src/array/primitive_array.rs
index 75bf85b3f..319910438 100644
--- a/arrow-array/src/array/primitive_array.rs
+++ b/arrow-array/src/array/primitive_array.rs
@@ -29,7 +29,7 @@ use arrow_buffer::{
i256, ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, ScalarBuffer,
};
use arrow_data::bit_iterator::try_for_each_valid_idx;
-use arrow_data::ArrayData;
+use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType};
use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime};
use half::f16;
@@ -248,17 +248,18 @@ pub use crate::types::ArrowPrimitiveType;
/// }
/// ```
pub struct PrimitiveArray<T: ArrowPrimitiveType> {
- /// Underlying ArrayData
- data: ArrayData,
+ data_type: DataType,
/// Values data
values: ScalarBuffer<T::Native>,
+ nulls: Option<NullBuffer>,
}
impl<T: ArrowPrimitiveType> Clone for PrimitiveArray<T> {
fn clone(&self) -> Self {
Self {
- data: self.data.clone(),
+ data_type: self.data_type.clone(),
values: self.values.clone(),
+ nulls: self.nulls.clone(),
}
}
}
@@ -281,16 +282,11 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
assert_eq!(values.len(), n.len());
}
- // TODO: Don't store ArrayData inside arrays (#3880)
- let data = unsafe {
- ArrayData::builder(data_type)
- .len(values.len())
- .nulls(nulls)
- .buffers(vec![values.inner().clone()])
- .build_unchecked()
- };
-
- Self { data, values }
+ Self {
+ data_type,
+ values,
+ nulls,
+ }
}
/// Asserts that `data_type` is compatible with `Self`
@@ -306,12 +302,12 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
/// Returns the length of this array.
#[inline]
pub fn len(&self) -> usize {
- self.data.len()
+ self.values.len()
}
/// Returns whether this array is empty.
pub fn is_empty(&self) -> bool {
- self.data.is_empty()
+ self.values.is_empty()
}
/// Returns the values of this array
@@ -367,18 +363,12 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
/// Creates a PrimitiveArray based on an iterator of values without nulls
pub fn from_iter_values<I: IntoIterator<Item = T::Native>>(iter: I) ->
Self {
let val_buf: Buffer = iter.into_iter().collect();
- let data = unsafe {
- ArrayData::new_unchecked(
- T::DATA_TYPE,
- val_buf.len() / std::mem::size_of::<<T as
ArrowPrimitiveType>::Native>(),
- None,
- None,
- 0,
- vec![val_buf],
- vec![],
- )
- };
- PrimitiveArray::from(data)
+ let len = val_buf.len() / std::mem::size_of::<T::Native>();
+ Self {
+ data_type: T::DATA_TYPE,
+ values: ScalarBuffer::new(val_buf, 0, len),
+ nulls: None,
+ }
}
/// Creates a PrimitiveArray based on a constant value with `count`
elements
@@ -410,8 +400,11 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
/// Returns a zero-copy slice of this array with the indicated offset and
length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ Self {
+ data_type: self.data_type.clone(),
+ values: self.values.slice(offset, length),
+ nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
+ }
}
/// Reinterprets this array's contents as a different data type without
copying
@@ -436,7 +429,7 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
where
K: ArrowPrimitiveType<Native = T::Native>,
{
- let d = self.data.clone().into_builder().data_type(K::DATA_TYPE);
+ let d = self.to_data().into_builder().data_type(K::DATA_TYPE);
// SAFETY:
// Native type is the same
@@ -629,14 +622,14 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
/// data buffer is not shared by others.
pub fn into_builder(self) -> Result<PrimitiveBuilder<T>, Self> {
let len = self.len();
- let null_bit_buffer = self.data.nulls().map(|b| b.inner().sliced());
+ let data = self.into_data();
+ let null_bit_buffer = data.nulls().map(|b| b.inner().sliced());
let element_len = std::mem::size_of::<T::Native>();
- let buffer = self.data.buffers()[0]
- .slice_with_length(self.data.offset() * element_len, len *
element_len);
+ let buffer = data.buffers()[0]
+ .slice_with_length(data.offset() * element_len, len * element_len);
- drop(self.data);
- drop(self.values);
+ drop(data);
let try_mutable_null_buffer = match null_bit_buffer {
None => Ok(None),
@@ -686,7 +679,12 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
impl<T: ArrowPrimitiveType> From<PrimitiveArray<T>> for ArrayData {
fn from(array: PrimitiveArray<T>) -> Self {
- array.data
+ let builder = ArrayDataBuilder::new(array.data_type)
+ .len(array.values.len())
+ .nulls(array.nulls)
+ .buffers(vec![array.values.into_inner()]);
+
+ unsafe { builder.build_unchecked() }
}
}
@@ -695,24 +693,48 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.values.len()
+ }
+
+ fn is_empty(&self) -> bool {
+ self.values.is_empty()
+ }
+
+ fn offset(&self) -> usize {
+ 0
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
- self.data.nulls()
+ self.nulls.as_ref()
+ }
+
+ fn get_buffer_memory_size(&self) -> usize {
+ let mut size = self.values.inner().capacity();
+ if let Some(n) = self.nulls.as_ref() {
+ size += n.buffer().capacity();
+ }
+ size
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ std::mem::size_of::<Self>() + self.get_buffer_memory_size()
}
}
@@ -1061,8 +1083,7 @@ impl<T: ArrowTimestampType> PrimitiveArray<T> {
/// Construct a timestamp array with an optional timezone
pub fn with_timezone_opt<S: Into<Arc<str>>>(&self, timezone: Option<S>) ->
Self {
let array_data = unsafe {
- self.data
- .clone()
+ self.to_data()
.into_builder()
.data_type(DataType::Timestamp(T::UNIT,
timezone.map(Into::into)))
.build_unchecked()
@@ -1083,7 +1104,11 @@ impl<T: ArrowPrimitiveType> From<ArrayData> for
PrimitiveArray<T> {
let values =
ScalarBuffer::new(data.buffers()[0].clone(), data.offset(),
data.len());
- Self { data, values }
+ Self {
+ data_type: data.data_type().clone(),
+ values,
+ nulls: data.nulls().cloned(),
+ }
}
}
@@ -1108,12 +1133,10 @@ impl<T: DecimalType + ArrowPrimitiveType>
PrimitiveArray<T> {
self.validate_precision_scale(precision, scale)?;
// safety: self.data is valid DataType::Decimal as checked above
- let new_data_type = T::TYPE_CONSTRUCTOR(precision, scale);
- let data = self.data.into_builder().data_type(new_data_type);
-
- // SAFETY
- // Validated data above
- Ok(unsafe { data.build_unchecked().into() })
+ Ok(Self {
+ data_type: T::TYPE_CONSTRUCTOR(precision, scale),
+ ..self
+ })
}
// validate that the new precision and scale are valid or not
@@ -1244,7 +1267,7 @@ mod tests {
fn test_primitive_array_from_vec() {
let buf = Buffer::from_slice_ref([0, 1, 2, 3, 4]);
let arr = Int32Array::from(vec![0, 1, 2, 3, 4]);
- assert_eq!(buf, *arr.data.buffers()[0]);
+ assert_eq!(&buf, arr.values.inner());
assert_eq!(5, arr.len());
assert_eq!(0, arr.offset());
assert_eq!(0, arr.null_count());
@@ -1484,7 +1507,6 @@ mod tests {
let arr2 = arr.slice(2, 5);
assert_eq!(5, arr2.len());
- assert_eq!(2, arr2.offset());
assert_eq!(1, arr2.null_count());
for i in 0..arr2.len() {
@@ -1497,7 +1519,6 @@ mod tests {
let arr3 = arr2.slice(2, 3);
assert_eq!(3, arr3.len());
- assert_eq!(4, arr3.offset());
assert_eq!(0, arr3.null_count());
let int_arr3 = arr3.as_any().downcast_ref::<Int32Array>().unwrap();
@@ -1742,7 +1763,7 @@ mod tests {
fn test_primitive_array_builder() {
// Test building a primitive array with ArrayData builder and offset
let buf = Buffer::from_slice_ref([0i32, 1, 2, 3, 4, 5, 6]);
- let buf2 = buf.clone();
+ let buf2 = buf.slice_with_length(8, 20);
let data = ArrayData::builder(DataType::Int32)
.len(5)
.offset(2)
@@ -1750,7 +1771,7 @@ mod tests {
.build()
.unwrap();
let arr = Int32Array::from(data);
- assert_eq!(buf2, *arr.data.buffers()[0]);
+ assert_eq!(&buf2, arr.values.inner());
assert_eq!(5, arr.len());
assert_eq!(0, arr.null_count());
for i in 0..3 {
diff --git a/arrow-array/src/array/run_array.rs
b/arrow-array/src/array/run_array.rs
index 0754913e9..e7e71d384 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -62,7 +62,7 @@ use crate::{
/// ```
pub struct RunArray<R: RunEndIndexType> {
- data: ArrayData,
+ data_type: DataType,
run_ends: RunEndBuffer<R::Native>,
values: ArrayRef,
}
@@ -70,7 +70,7 @@ pub struct RunArray<R: RunEndIndexType> {
impl<R: RunEndIndexType> Clone for RunArray<R> {
fn clone(&self) -> Self {
Self {
- data: self.data.clone(),
+ data_type: self.data_type.clone(),
run_ends: self.run_ends.clone(),
values: self.values.clone(),
}
@@ -256,8 +256,11 @@ impl<R: RunEndIndexType> RunArray<R> {
/// Returns a zero-copy slice of this array with the indicated offset and
length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ Self {
+ data_type: self.data_type.clone(),
+ run_ends: self.run_ends.slice(offset, length),
+ values: self.values.clone(),
+ }
}
}
@@ -282,7 +285,7 @@ impl<R: RunEndIndexType> From<ArrayData> for RunArray<R> {
let values = make_array(data.child_data()[1].clone());
Self {
- data,
+ data_type: data.data_type().clone(),
run_ends,
values,
}
@@ -291,7 +294,21 @@ impl<R: RunEndIndexType> From<ArrayData> for RunArray<R> {
impl<R: RunEndIndexType> From<RunArray<R>> for ArrayData {
fn from(array: RunArray<R>) -> Self {
- array.data
+ let len = array.run_ends.len();
+ let offset = array.run_ends.offset();
+
+ let run_ends = ArrayDataBuilder::new(R::DATA_TYPE)
+ .len(array.run_ends.values().len())
+ .buffers(vec![array.run_ends.into_inner().into_inner()]);
+
+ let run_ends = unsafe { run_ends.build_unchecked() };
+
+ let builder = ArrayDataBuilder::new(array.data_type)
+ .len(len)
+ .offset(offset)
+ .child_data(vec![run_ends, array.values.to_data()]);
+
+ unsafe { builder.build_unchecked() }
}
}
@@ -300,25 +317,47 @@ impl<T: RunEndIndexType> Array for RunArray<T> {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.run_ends.len()
+ }
+
+ fn is_empty(&self) -> bool {
+ self.run_ends.is_empty()
+ }
+
+ fn offset(&self) -> usize {
+ self.run_ends.offset()
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
None
}
+
+ fn get_buffer_memory_size(&self) -> usize {
+ self.run_ends.inner().inner().capacity() +
self.values.get_buffer_memory_size()
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ std::mem::size_of::<Self>()
+ + self.run_ends.inner().inner().capacity()
+ + self.values.get_array_memory_size()
+ }
}
impl<R: RunEndIndexType> std::fmt::Debug for RunArray<R> {
@@ -497,10 +536,6 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for
TypedRunArray<'a, R, V> {
self.run_array
}
- fn data(&self) -> &ArrayData {
- &self.run_array.data
- }
-
fn to_data(&self) -> ArrayData {
self.run_array.to_data()
}
@@ -509,13 +544,37 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for
TypedRunArray<'a, R, V> {
self.run_array.into_data()
}
+ fn data_type(&self) -> &DataType {
+ self.run_array.data_type()
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.run_array.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.run_array.len()
+ }
+
+ fn is_empty(&self) -> bool {
+ self.run_array.is_empty()
+ }
+
+ fn offset(&self) -> usize {
+ self.run_array.offset()
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
self.run_array.nulls()
}
+
+ fn get_buffer_memory_size(&self) -> usize {
+ self.run_array.get_buffer_memory_size()
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ self.run_array.get_array_memory_size()
+ }
}
// Array accessor converts the index of logical array to the index of the
physical array
diff --git a/arrow-array/src/array/struct_array.rs
b/arrow-array/src/array/struct_array.rs
index 1dccfc7d4..fa43062b7 100644
--- a/arrow-array/src/array/struct_array.rs
+++ b/arrow-array/src/array/struct_array.rs
@@ -17,7 +17,7 @@
use crate::{make_array, Array, ArrayRef, RecordBatch};
use arrow_buffer::{buffer_bin_or, Buffer, NullBuffer};
-use arrow_data::ArrayData;
+use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, Field, Fields, SchemaBuilder};
use std::sync::Arc;
use std::{any::Any, ops::Index};
@@ -74,24 +74,26 @@ use std::{any::Any, ops::Index};
/// ```
#[derive(Clone)]
pub struct StructArray {
- data: ArrayData,
- pub(crate) boxed_fields: Vec<ArrayRef>,
+ len: usize,
+ data_type: DataType,
+ nulls: Option<NullBuffer>,
+ pub(crate) fields: Vec<ArrayRef>,
}
impl StructArray {
/// Returns the field at `pos`.
pub fn column(&self, pos: usize) -> &ArrayRef {
- &self.boxed_fields[pos]
+ &self.fields[pos]
}
/// Return the number of fields in this struct array
pub fn num_columns(&self) -> usize {
- self.boxed_fields.len()
+ self.fields.len()
}
/// Returns the fields of the struct array
pub fn columns(&self) -> &[ArrayRef] {
- &self.boxed_fields
+ &self.fields
}
/// Returns child array refs of the struct array
@@ -102,7 +104,7 @@ impl StructArray {
/// Return field names in this struct array
pub fn column_names(&self) -> Vec<&str> {
- match self.data.data_type() {
+ match self.data_type() {
DataType::Struct(fields) => fields
.iter()
.map(|f| f.name().as_str())
@@ -132,27 +134,48 @@ impl StructArray {
}
/// Returns a zero-copy slice of this array with the indicated offset and
length.
- pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ pub fn slice(&self, offset: usize, len: usize) -> Self {
+ assert!(
+ offset.saturating_add(len) <= self.len,
+ "the length + offset of the sliced StructArray cannot exceed the
existing length"
+ );
+
+ let fields = self.fields.iter().map(|a| a.slice(offset,
len)).collect();
+
+ Self {
+ len,
+ data_type: self.data_type.clone(),
+ nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)),
+ fields,
+ }
}
}
impl From<ArrayData> for StructArray {
fn from(data: ArrayData) -> Self {
- let boxed_fields = data
+ let fields = data
.child_data()
.iter()
.map(|cd| make_array(cd.clone()))
.collect();
- Self { data, boxed_fields }
+ Self {
+ len: data.len(),
+ data_type: data.data_type().clone(),
+ nulls: data.nulls().cloned(),
+ fields,
+ }
}
}
impl From<StructArray> for ArrayData {
fn from(array: StructArray) -> Self {
- array.data
+ let builder = ArrayDataBuilder::new(array.data_type)
+ .len(array.len)
+ .nulls(array.nulls)
+ .child_data(array.fields.iter().map(|x| x.to_data()).collect());
+
+ unsafe { builder.build_unchecked() }
}
}
@@ -228,24 +251,53 @@ impl Array for StructArray {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.len
+ }
+
+ fn is_empty(&self) -> bool {
+ self.len == 0
+ }
+
+ fn offset(&self) -> usize {
+ 0
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
- self.data.nulls()
+ self.nulls.as_ref()
+ }
+
+ fn get_buffer_memory_size(&self) -> usize {
+ let mut size = self.fields.iter().map(|a|
a.get_buffer_memory_size()).sum();
+ if let Some(n) = self.nulls.as_ref() {
+ size += n.buffer().capacity();
+ }
+ size
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ let mut size = self.fields.iter().map(|a|
a.get_array_memory_size()).sum();
+ size += std::mem::size_of::<Self>();
+ if let Some(n) = self.nulls.as_ref() {
+ size += n.buffer().capacity();
+ }
+ size
}
}
@@ -343,15 +395,11 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for
StructArray {
impl From<RecordBatch> for StructArray {
fn from(value: RecordBatch) -> Self {
- // TODO: Don't store ArrayData inside arrays (#3880)
- let builder =
ArrayData::builder(DataType::Struct(value.schema().fields.clone()))
- .child_data(value.columns().iter().map(|x| x.to_data()).collect())
- .len(value.num_rows());
-
- // Safety: RecordBatch must be valid
Self {
- data: unsafe { builder.build_unchecked() },
- boxed_fields: value.columns().to_vec(),
+ len: value.num_rows(),
+ data_type: DataType::Struct(value.schema().fields().clone()),
+ nulls: None,
+ fields: value.columns().to_vec(),
}
}
}
@@ -607,7 +655,6 @@ mod tests {
let sliced_array = struct_array.slice(2, 3);
let sliced_array =
sliced_array.as_any().downcast_ref::<StructArray>().unwrap();
assert_eq!(3, sliced_array.len());
- assert_eq!(2, sliced_array.offset());
assert_eq!(1, sliced_array.null_count());
assert!(sliced_array.is_valid(0));
assert!(sliced_array.is_null(1));
@@ -616,7 +663,6 @@ mod tests {
let sliced_c0 = sliced_array.column(0);
let sliced_c0 =
sliced_c0.as_any().downcast_ref::<BooleanArray>().unwrap();
assert_eq!(3, sliced_c0.len());
- assert_eq!(2, sliced_c0.offset());
assert!(sliced_c0.is_null(0));
assert!(sliced_c0.is_null(1));
assert!(sliced_c0.is_valid(2));
@@ -625,7 +671,6 @@ mod tests {
let sliced_c1 = sliced_array.column(1);
let sliced_c1 =
sliced_c1.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(3, sliced_c1.len());
- assert_eq!(2, sliced_c1.offset());
assert!(sliced_c1.is_valid(0));
assert_eq!(42, sliced_c1.value(0));
assert!(sliced_c1.is_null(1));
diff --git a/arrow-array/src/array/union_array.rs
b/arrow-array/src/array/union_array.rs
index 7b818f313..172ae0821 100644
--- a/arrow-array/src/array/union_array.rs
+++ b/arrow-array/src/array/union_array.rs
@@ -18,7 +18,7 @@
use crate::{make_array, Array, ArrayRef};
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::{Buffer, ScalarBuffer};
-use arrow_data::ArrayData;
+use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, Field, UnionFields, UnionMode};
/// Contains the `UnionArray` type.
///
@@ -108,10 +108,10 @@ use std::sync::Arc;
/// ```
#[derive(Clone)]
pub struct UnionArray {
- data: ArrayData,
+ data_type: DataType,
type_ids: ScalarBuffer<i8>,
offsets: Option<ScalarBuffer<i32>>,
- boxed_fields: Vec<Option<ArrayRef>>,
+ fields: Vec<Option<ArrayRef>>,
}
impl UnionArray {
@@ -231,7 +231,7 @@ impl UnionArray {
/// Panics if the `type_id` provided is less than zero or greater than the
number of types
/// in the `Union`.
pub fn child(&self, type_id: i8) -> &ArrayRef {
- let boxed = &self.boxed_fields[type_id as usize];
+ let boxed = &self.fields[type_id as usize];
boxed.as_ref().expect("invalid type id")
}
@@ -279,7 +279,7 @@ impl UnionArray {
/// Returns the names of the types in the union.
pub fn type_names(&self) -> Vec<&str> {
- match self.data.data_type() {
+ match self.data_type() {
DataType::Union(fields, _) => fields
.iter()
.map(|(_, f)| f.name().as_str())
@@ -290,7 +290,7 @@ impl UnionArray {
/// Returns whether the `UnionArray` is dense (or sparse if `false`).
fn is_dense(&self) -> bool {
- match self.data.data_type() {
+ match self.data_type() {
DataType::Union(_, mode) => mode == &UnionMode::Dense,
_ => unreachable!("Union array's data type is not a union!"),
}
@@ -298,8 +298,26 @@ impl UnionArray {
/// Returns a zero-copy slice of this array with the indicated offset and
length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
- // TODO: Slice buffers directly (#3880)
- self.data.slice(offset, length).into()
+ let (offsets, fields) = match self.offsets.as_ref() {
+ // If dense union, slice offsets
+ Some(offsets) => (Some(offsets.slice(offset, length)),
self.fields.clone()),
+ // Otherwise need to slice sparse children
+ None => {
+ let fields = self
+ .fields
+ .iter()
+ .map(|x| x.as_ref().map(|x| x.slice(offset, length)))
+ .collect();
+ (None, fields)
+ }
+ };
+
+ Self {
+ data_type: self.data_type.clone(),
+ type_ids: self.type_ids.slice(offset, length),
+ offsets,
+ fields,
+ }
}
}
@@ -330,17 +348,36 @@ impl From<ArrayData> for UnionArray {
boxed_fields[field_id as usize] = Some(make_array(cd.clone()));
}
Self {
- data,
+ data_type: data.data_type().clone(),
type_ids,
offsets,
- boxed_fields,
+ fields: boxed_fields,
}
}
}
impl From<UnionArray> for ArrayData {
fn from(array: UnionArray) -> Self {
- array.data
+ let len = array.len();
+ let f = match &array.data_type {
+ DataType::Union(f, _) => f,
+ _ => unreachable!(),
+ };
+ let buffers = match array.offsets {
+ Some(o) => vec![array.type_ids.into_inner(), o.into_inner()],
+ None => vec![array.type_ids.into_inner()],
+ };
+
+ let child = f
+ .iter()
+ .map(|(i, _)| array.fields[i as usize].as_ref().unwrap().to_data())
+ .collect();
+
+ let builder = ArrayDataBuilder::new(array.data_type)
+ .len(len)
+ .buffers(buffers)
+ .child_data(child);
+ unsafe { builder.build_unchecked() }
}
}
@@ -349,22 +386,34 @@ impl Array for UnionArray {
self
}
- fn data(&self) -> &ArrayData {
- &self.data
- }
-
fn to_data(&self) -> ArrayData {
- self.data.clone()
+ self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
+ fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
+ fn len(&self) -> usize {
+ self.type_ids.len()
+ }
+
+ fn is_empty(&self) -> bool {
+ self.type_ids.is_empty()
+ }
+
+ fn offset(&self) -> usize {
+ 0
+ }
+
fn nulls(&self) -> Option<&NullBuffer> {
None
}
@@ -386,6 +435,32 @@ impl Array for UnionArray {
fn null_count(&self) -> usize {
0
}
+
+ fn get_buffer_memory_size(&self) -> usize {
+ let mut sum = self.type_ids.inner().capacity();
+ if let Some(o) = self.offsets.as_ref() {
+ sum += o.inner().capacity()
+ }
+ self.fields
+ .iter()
+ .flat_map(|x| x.as_ref().map(|x| x.get_buffer_memory_size()))
+ .sum::<usize>()
+ + sum
+ }
+
+ fn get_array_memory_size(&self) -> usize {
+ let mut sum = self.type_ids.inner().capacity();
+ if let Some(o) = self.offsets.as_ref() {
+ sum += o.inner().capacity()
+ }
+ std::mem::size_of::<Self>()
+ + self
+ .fields
+ .iter()
+ .flat_map(|x| x.as_ref().map(|x| x.get_array_memory_size()))
+ .sum::<usize>()
+ + sum
+ }
}
impl std::fmt::Debug for UnionArray {
diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs
index 1350285f8..ee61d2da6 100644
--- a/arrow-array/src/record_batch.rs
+++ b/arrow-array/src/record_batch.rs
@@ -474,7 +474,7 @@ impl From<StructArray> for RecordBatch {
);
let row_count = value.len();
let schema = Arc::new(Schema::new(value.fields().clone()));
- let columns = value.boxed_fields;
+ let columns = value.fields;
RecordBatch {
schema,
@@ -614,7 +614,7 @@ mod tests {
let record_batch =
RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a),
Arc::new(b)])
.unwrap();
- assert_eq!(record_batch.get_array_memory_size(), 564);
+ assert_eq!(record_batch.get_array_memory_size(), 364);
}
fn check_batch(record_batch: RecordBatch, num_rows: usize) {
diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs
index 05b56a0e8..2c1dae518 100644
--- a/arrow-cast/src/cast.rs
+++ b/arrow-cast/src/cast.rs
@@ -4667,10 +4667,8 @@ mod tests {
let array = Int32Array::from(vec![-5, 6, -7, 8, 100000000]);
assert_eq!(0, array.offset());
let array = array.slice(2, 3);
- assert_eq!(2, array.offset());
let b = cast(&array, &DataType::UInt8).unwrap();
assert_eq!(3, b.len());
- assert_eq!(0, b.offset());
let c = b.as_any().downcast_ref::<UInt8Array>().unwrap();
assert!(!c.is_valid(0));
assert_eq!(8, c.value(1));
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index 7d44d8f24..08ddd1812 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -24,11 +24,11 @@ use std::cmp::min;
use std::collections::HashMap;
use std::io::{BufWriter, Write};
-use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType};
use flatbuffers::FlatBufferBuilder;
use arrow_array::builder::BufferBuilder;
use arrow_array::cast::*;
+use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType};
use arrow_array::*;
use arrow_buffer::bit_util;
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
@@ -1107,94 +1107,33 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize
{
}
}
-/// Returns byte width for binary value_offset buffer spec.
-#[inline]
-fn get_value_offset_byte_width(data_type: &DataType) -> usize {
- match data_type {
- DataType::Binary | DataType::Utf8 => 4,
- DataType::LargeBinary | DataType::LargeUtf8 => 8,
- _ => unreachable!(),
+/// Returns the values and offsets [`Buffer`] for a ByteArray with offset type
`O`
+///
+/// In particular, this handles re-encoding the offsets if they don't start at
`0`,
+/// slicing the values buffer as appropriate. This helps reduce the encoded
+/// size of sliced arrays, as values that have been sliced away are not encoded
+fn get_byte_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer,
Buffer) {
+ if data.is_empty() {
+ return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into());
}
-}
-/// Returns the number of total bytes in base binary arrays.
-fn get_binary_buffer_len(array_data: &ArrayData) -> usize {
- if array_data.is_empty() {
- return 0;
- }
- match array_data.data_type() {
- DataType::Binary => {
- let array: BinaryArray = array_data.clone().into();
- let offsets = array.value_offsets();
- (offsets[array_data.len()] - offsets[0]) as usize
- }
- DataType::LargeBinary => {
- let array: LargeBinaryArray = array_data.clone().into();
- let offsets = array.value_offsets();
- (offsets[array_data.len()] - offsets[0]) as usize
- }
- DataType::Utf8 => {
- let array: StringArray = array_data.clone().into();
- let offsets = array.value_offsets();
- (offsets[array_data.len()] - offsets[0]) as usize
- }
- DataType::LargeUtf8 => {
- let array: LargeStringArray = array_data.clone().into();
- let offsets = array.value_offsets();
- (offsets[array_data.len()] - offsets[0]) as usize
- }
- _ => unreachable!(),
- }
-}
+ let buffers = data.buffers();
+ let offsets: &[O] = buffers[0].typed_data::<O>();
+ let offset_slice = &offsets[data.offset()..data.offset() + data.len() + 1];
-/// Rebase value offsets for given ArrayData to zero-based.
-fn get_zero_based_value_offsets<OffsetSize: OffsetSizeTrait>(
- array_data: &ArrayData,
-) -> Buffer {
- match array_data.data_type() {
- DataType::Binary | DataType::LargeBinary => {
- let array: GenericBinaryArray<OffsetSize> =
array_data.clone().into();
- let offsets = array.value_offsets();
- let start_offset = offsets[0];
-
- let mut builder =
BufferBuilder::<OffsetSize>::new(array_data.len() + 1);
- for x in offsets {
- builder.append(*x - start_offset);
- }
+ let start_offset = offset_slice.first().unwrap();
+ let end_offset = offset_slice.last().unwrap();
- builder.finish()
- }
- DataType::Utf8 | DataType::LargeUtf8 => {
- let array: GenericStringArray<OffsetSize> =
array_data.clone().into();
- let offsets = array.value_offsets();
- let start_offset = offsets[0];
-
- let mut builder =
BufferBuilder::<OffsetSize>::new(array_data.len() + 1);
- for x in offsets {
- builder.append(*x - start_offset);
- }
-
- builder.finish()
- }
- _ => unreachable!(),
- }
-}
+ let offsets = match start_offset.as_usize() {
+ 0 => buffers[0].clone(),
+ _ => offset_slice.iter().map(|x| *x - *start_offset).collect(),
+ };
-/// Returns the start offset of base binary array.
-fn get_buffer_offset<OffsetSize: OffsetSizeTrait>(array_data: &ArrayData) ->
OffsetSize {
- match array_data.data_type() {
- DataType::Binary | DataType::LargeBinary => {
- let array: GenericBinaryArray<OffsetSize> =
array_data.clone().into();
- let offsets = array.value_offsets();
- offsets[0]
- }
- DataType::Utf8 | DataType::LargeUtf8 => {
- let array: GenericStringArray<OffsetSize> =
array_data.clone().into();
- let offsets = array.value_offsets();
- offsets[0]
- }
- _ => unreachable!(),
- }
+ let values = buffers[1].slice_with_length(
+ start_offset.as_usize(),
+ end_offset.as_usize() - start_offset.as_usize(),
+ );
+ (offsets, values)
}
/// Write array data to a vector of bytes
@@ -1241,65 +1180,27 @@ fn write_array_data(
}
let data_type = array_data.data_type();
- if matches!(
- data_type,
- DataType::Binary | DataType::LargeBinary | DataType::Utf8 |
DataType::LargeUtf8
- ) {
- let offset_buffer = &array_data.buffers()[0];
- let value_offset_byte_width = get_value_offset_byte_width(data_type);
- let min_length = (array_data.len() + 1) * value_offset_byte_width;
- if buffer_need_truncate(
- array_data.offset(),
- offset_buffer,
- &BufferSpec::FixedWidth {
- byte_width: value_offset_byte_width,
- },
- min_length,
- ) {
- // Rebase offsets and truncate values
- let (new_offsets, byte_offset) =
- if matches!(data_type, DataType::Binary | DataType::Utf8) {
- (
- get_zero_based_value_offsets::<i32>(array_data),
- get_buffer_offset::<i32>(array_data) as usize,
- )
- } else {
- (
- get_zero_based_value_offsets::<i64>(array_data),
- get_buffer_offset::<i64>(array_data) as usize,
- )
- };
-
+ if matches!(data_type, DataType::Binary | DataType::Utf8) {
+ let (offsets, values) = get_byte_array_buffers::<i32>(array_data);
+ for buffer in [offsets, values] {
offset = write_buffer(
- new_offsets.as_slice(),
+ buffer.as_slice(),
buffers,
arrow_data,
offset,
compression_codec,
)?;
-
- let total_bytes = get_binary_buffer_len(array_data);
- let value_buffer = &array_data.buffers()[1];
- let buffer_length = min(total_bytes, value_buffer.len() -
byte_offset);
- let buffer_slice =
- &value_buffer.as_slice()[byte_offset..(byte_offset +
buffer_length)];
+ }
+ } else if matches!(data_type, DataType::LargeBinary | DataType::LargeUtf8)
{
+ let (offsets, values) = get_byte_array_buffers::<i64>(array_data);
+ for buffer in [offsets, values] {
offset = write_buffer(
- buffer_slice,
+ buffer.as_slice(),
buffers,
arrow_data,
offset,
compression_codec,
)?;
- } else {
- for buffer in array_data.buffers() {
- offset = write_buffer(
- buffer.as_slice(),
- buffers,
- arrow_data,
- offset,
- compression_codec,
- )?;
- }
}
} else if DataType::is_numeric(data_type)
|| DataType::is_temporal(data_type)
@@ -1445,20 +1346,20 @@ fn pad_to_8(len: u32) -> usize {
#[cfg(test)]
mod tests {
- use super::*;
-
use std::io::Cursor;
use std::io::Seek;
use std::sync::Arc;
- use crate::MetadataVersion;
-
- use crate::reader::*;
use arrow_array::builder::PrimitiveRunBuilder;
use arrow_array::builder::UnionBuilder;
use arrow_array::types::*;
use arrow_schema::DataType;
+ use crate::reader::*;
+ use crate::MetadataVersion;
+
+ use super::*;
+
#[test]
#[cfg(feature = "lz4")]
fn test_write_empty_record_batch_lz4_compression() {
diff --git a/arrow-pyarrow-integration-testing/src/lib.rs
b/arrow-pyarrow-integration-testing/src/lib.rs
index cf94b0dd4..af400868f 100644
--- a/arrow-pyarrow-integration-testing/src/lib.rs
+++ b/arrow-pyarrow-integration-testing/src/lib.rs
@@ -52,7 +52,7 @@ fn double(array: &PyAny, py: Python) -> PyResult<PyObject> {
let array = kernels::arithmetic::add(array, array).map_err(to_py_err)?;
// export
- array.data().to_pyarrow(py)
+ array.to_data().to_pyarrow(py)
}
/// calls a lambda function that receives and returns an array
@@ -64,7 +64,7 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult<bool> {
let expected = Arc::new(Int64Array::from(vec![Some(2), None, Some(6)])) as
ArrayRef;
// to py
- let pyarray = array.data().to_pyarrow(py)?;
+ let pyarray = array.to_data().to_pyarrow(py)?;
let pyarray = lambda.call1((pyarray,))?;
let array = make_array(ArrayData::from_pyarrow(pyarray)?);
@@ -75,7 +75,7 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult<bool> {
fn make_empty_array(datatype: PyArrowType<DataType>, py: Python) ->
PyResult<PyObject> {
let array = new_empty_array(&datatype.0);
- array.data().to_pyarrow(py)
+ array.to_data().to_pyarrow(py)
}
/// Returns the substring
@@ -90,7 +90,7 @@ fn substring(
// substring
let array = kernels::substring::substring(array.as_ref(), start,
None).map_err(to_py_err)?;
- Ok(array.data().to_owned().into())
+ Ok(array.to_data().into())
}
/// Returns the concatenate
@@ -101,7 +101,7 @@ fn concatenate(array: PyArrowType<ArrayData>, py: Python)
-> PyResult<PyObject>
// concat
let array = kernels::concat::concat(&[array.as_ref(),
array.as_ref()]).map_err(to_py_err)?;
- array.data().to_pyarrow(py)
+ array.to_data().to_pyarrow(py)
}
#[pyfunction]
diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs
index 6039d53ea..aaa3423d6 100644
--- a/arrow-select/src/nullif.rs
+++ b/arrow-select/src/nullif.rs
@@ -208,8 +208,7 @@ mod tests {
let s = s.slice(2, 3);
let select = select.slice(1, 3);
- let select = select.as_boolean();
- let a = nullif(&s, select).unwrap();
+ let a = nullif(&s, &select).unwrap();
let r: Vec<_> = a.as_string::<i32>().iter().collect();
assert_eq!(r, vec![None, Some("a"), None]);
}
@@ -509,9 +508,8 @@ mod tests {
.map(|_| rng.gen_bool(0.5).then(|| rng.gen_bool(0.5)))
.collect();
let b = b.slice(b_start_offset, a_length);
- let b = b.as_boolean();
- test_nullif(&a, b);
+ test_nullif(&a, &b);
}
}
}
diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs
index 8bad29bf7..27c905ba0 100644
--- a/arrow/src/lib.rs
+++ b/arrow/src/lib.rs
@@ -322,7 +322,7 @@
//! Advanced users may wish to interact with the underlying buffers of an
[`Array`], for example,
//! for FFI or high-performance conversion from other formats. This interface
is provided by
//! [`ArrayData`] which stores the [`Buffer`] comprising an [`Array`], and can
be accessed
-//! with [`Array::data`](array::Array::data)
+//! with [`Array::to_data`](array::Array::to_data)
//!
//! The APIs for constructing [`ArrayData`] come in safe, and unsafe variants,
with the former
//! performing extensive, but potentially expensive validation to ensure the
buffers are well-formed.