This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new ef1cc38bbf Cleanup list casting and support nested lists (#5113) (#5124)
ef1cc38bbf is described below
commit ef1cc38bbfad9e596ed2ed129421f9b657445dc0
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Mon Nov 27 10:37:02 2023 +0000
Cleanup list casting and support nested lists (#5113) (#5124)
* Cleanup list casting and support nested lists (#5113)
* Clippy
* Update can_cast_types
---
arrow-cast/src/cast.rs | 183 ++++++++++++++++++-------------------------------
1 file changed, 68 insertions(+), 115 deletions(-)
diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs
index dd3e271afb..22faedb96f 100644
--- a/arrow-cast/src/cast.rs
+++ b/arrow-cast/src/cast.rs
@@ -47,7 +47,7 @@ use crate::parse::{
string_to_datetime, Parser,
};
use arrow_array::{builder::*, cast::*, temporal_conversions::*, timezone::Tz,
types::*, *};
-use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer};
+use arrow_buffer::{i256, ArrowNativeType, OffsetBuffer};
use arrow_data::transform::MutableArrayData;
use arrow_data::ArrayData;
use arrow_schema::*;
@@ -124,18 +124,9 @@ pub fn can_cast_types(from_type: &DataType, to_type:
&DataType) -> bool {
}
(Dictionary(_, value_type), _) => can_cast_types(value_type, to_type),
(_, Dictionary(_, value_type)) => can_cast_types(from_type,
value_type),
- (LargeList(list_from), LargeList(list_to)) => {
+ (List(list_from) | LargeList(list_from), List(list_to) |
LargeList(list_to)) => {
can_cast_types(list_from.data_type(), list_to.data_type())
}
- (List(list_from), List(list_to)) => {
- can_cast_types(list_from.data_type(), list_to.data_type())
- }
- (List(list_from), LargeList(list_to)) => {
- list_from.data_type() == list_to.data_type()
- }
- (LargeList(list_from), List(list_to)) => {
- list_from.data_type() == list_to.data_type()
- }
(List(list_from) | LargeList(list_from), Utf8 | LargeUtf8) => {
can_cast_types(list_from.data_type(), to_type)
}
@@ -783,28 +774,10 @@ pub fn cast_with_options(
"Casting from type {from_type:?} to dictionary type
{to_type:?} not supported",
))),
},
- (List(_), List(ref to)) => cast_list_inner::<i32>(array, to, to_type,
cast_options),
- (LargeList(_), LargeList(ref to)) => {
- cast_list_inner::<i64>(array, to, to_type, cast_options)
- }
- (List(list_from), LargeList(list_to)) => {
- if list_to.data_type() != list_from.data_type() {
- Err(ArrowError::CastError(
- "cannot cast list to large-list with different child
data".into(),
- ))
- } else {
- cast_list_container::<i32, i64>(array, cast_options)
- }
- }
- (LargeList(list_from), List(list_to)) => {
- if list_to.data_type() != list_from.data_type() {
- Err(ArrowError::CastError(
- "cannot cast large-list to list with different child
data".into(),
- ))
- } else {
- cast_list_container::<i64, i32>(array, cast_options)
- }
- }
+ (List(_), List(to)) => cast_list_values::<i32>(array, to,
cast_options),
+ (LargeList(_), LargeList(to)) => cast_list_values::<i64>(array, to,
cast_options),
+ (List(_), LargeList(list_to)) => cast_list::<i32, i64>(array, list_to,
cast_options),
+ (LargeList(_), List(list_to)) => cast_list::<i64, i32>(array, list_to,
cast_options),
(List(_), FixedSizeList(field, size)) => {
let array = array.as_list::<i32>();
cast_list_to_fixed_size_list::<i32>(array, field, *size,
cast_options)
@@ -3046,28 +3019,6 @@ fn cast_values_to_list<O: OffsetSizeTrait>(
Ok(Arc::new(list))
}
-/// Helper function that takes an Generic list container and casts the inner
datatype.
-fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
- array: &dyn Array,
- to: &Field,
- to_type: &DataType,
- cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
- let data = array.to_data();
- let underlying_array = make_array(data.child_data()[0].clone());
- let cast_array = cast_with_options(underlying_array.as_ref(),
to.data_type(), cast_options)?;
- let builder = data
- .into_builder()
- .data_type(to_type.clone())
- .child_data(vec![cast_array.into_data()]);
-
- // Safety
- // Data was valid before
- let array_data = unsafe { builder.build_unchecked() };
- let list = GenericListArray::<OffsetSize>::from(array_data);
- Ok(Arc::new(list) as ArrayRef)
-}
-
/// A specified helper to cast from `GenericBinaryArray` to
`GenericStringArray` when they have same
/// offset size so re-encoding offset is unnecessary.
fn cast_binary_to_string<O: OffsetSizeTrait>(
@@ -3221,7 +3172,7 @@ where
fn cast_list_to_fixed_size_list<OffsetSize>(
array: &GenericListArray<OffsetSize>,
- field: &Arc<Field>,
+ field: &FieldRef,
size: i32,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError>
@@ -3289,75 +3240,57 @@ where
Ok(Arc::new(array))
}
-/// Cast the container type of List/Largelist array but not the inner types.
-/// This function can leave the value data intact and only has to cast the
offset dtypes.
-fn cast_list_container<OffsetSizeFrom, OffsetSizeTo>(
+/// Helper function that takes a generic list container and casts the inner datatype.
+fn cast_list_values<O: OffsetSizeTrait>(
array: &dyn Array,
- _cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError>
-where
- OffsetSizeFrom: OffsetSizeTrait + ToPrimitive,
- OffsetSizeTo: OffsetSizeTrait + NumCast,
-{
- let list = array.as_list::<OffsetSizeFrom>();
- // the value data stored by the list
- let values = list.values();
+ to: &FieldRef,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let list = array.as_list::<O>();
+ let values = cast_with_options(list.values(), to.data_type(),
cast_options)?;
+ Ok(Arc::new(GenericListArray::<O>::new(
+ to.clone(),
+ list.offsets().clone(),
+ values,
+ list.nulls().cloned(),
+ )))
+}
- let out_dtype = match array.data_type() {
- DataType::List(value_type) => {
- assert_eq!(
- std::mem::size_of::<OffsetSizeFrom>(),
- std::mem::size_of::<i32>()
- );
- assert_eq!(
- std::mem::size_of::<OffsetSizeTo>(),
- std::mem::size_of::<i64>()
- );
- DataType::LargeList(value_type.clone())
- }
- DataType::LargeList(value_type) => {
- assert_eq!(
- std::mem::size_of::<OffsetSizeFrom>(),
- std::mem::size_of::<i64>()
- );
- assert_eq!(
- std::mem::size_of::<OffsetSizeTo>(),
- std::mem::size_of::<i32>()
- );
- if values.len() > i32::MAX as usize {
- return Err(ArrowError::ComputeError(
- "LargeList too large to cast to List".into(),
- ));
- }
- DataType::List(value_type.clone())
- }
- // implementation error
- _ => unreachable!(),
- };
+/// Cast the container type of List/LargeList array along with the inner datatype.
+fn cast_list<I: OffsetSizeTrait, O: OffsetSizeTrait>(
+ array: &dyn Array,
+ field: &FieldRef,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let list = array.as_list::<I>();
+ let values = list.values();
+ let offsets = list.offsets();
+ let nulls = list.nulls().cloned();
- let iter = list.value_offsets().iter().map(|idx| {
- let idx: OffsetSizeTo = NumCast::from(*idx).unwrap();
- idx
- });
+ if !O::IS_LARGE && values.len() > i32::MAX as usize {
+ return Err(ArrowError::ComputeError(
+ "LargeList too large to cast to List".into(),
+ ));
+ }
- // SAFETY
- // A slice produces a trusted length iterator
- let offset_buffer = unsafe { Buffer::from_trusted_len_iter(iter) };
+ // Recursively cast values
+ let values = cast_with_options(values, field.data_type(), cast_options)?;
+ let offsets: Vec<_> = offsets.iter().map(|x|
O::usize_as(x.as_usize())).collect();
- // wrap up
- let builder = ArrayData::builder(out_dtype)
- .len(list.len())
- .add_buffer(offset_buffer)
- .add_child_data(values.to_data())
- .nulls(list.nulls().cloned());
+ // Safety: valid offsets and checked for overflow
+ let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
- let array_data = unsafe { builder.build_unchecked() };
- Ok(Arc::new(GenericListArray::<OffsetSizeTo>::from(array_data)))
+ Ok(Arc::new(GenericListArray::<O>::new(
+ field.clone(),
+ offsets,
+ values,
+ nulls,
+ )))
}
#[cfg(test)]
mod tests {
- use arrow_buffer::NullBuffer;
+ use arrow_buffer::{Buffer, NullBuffer};
use super::*;
@@ -9154,6 +9087,26 @@ mod tests {
assert_eq!(formatted.value(1).to_string(), "[[4], [null], [6]]");
}
+ #[test]
+ fn test_nested_list_cast() {
+ let mut builder =
ListBuilder::new(ListBuilder::new(Int32Builder::new()));
+ builder.append_value([Some([Some(1), Some(2), None]), None]);
+ builder.append_value([None, Some([]), None]);
+ builder.append_null();
+ builder.append_value([Some([Some(2), Some(3)])]);
+ let start = builder.finish();
+
+ let mut builder =
LargeListBuilder::new(LargeListBuilder::new(Int8Builder::new()));
+ builder.append_value([Some([Some(1), Some(2), None]), None]);
+ builder.append_value([None, Some([]), None]);
+ builder.append_null();
+ builder.append_value([Some([Some(2), Some(3)])]);
+ let expected = builder.finish();
+
+ let actual = cast(&start, expected.data_type()).unwrap();
+ assert_eq!(actual.as_ref(), &expected);
+ }
+
const CAST_OPTIONS: CastOptions<'static> = CastOptions {
safe: true,
format_options: FormatOptions::new(),