This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 90d665e ARROW-4865: [Rust] Support list casts
90d665e is described below
commit 90d665e49b071d325f99b63733930e3c62088cf3
Author: Neville Dipale <[email protected]>
AuthorDate: Fri Mar 15 07:23:08 2019 -0600
ARROW-4865: [Rust] Support list casts
This is a follow up from the initial cast kernel PR, and adds support for:
* List<Primitive> to List<Primitive>
* Primitive to List<Primitive>
The only remaining expansion to the cast kernel will be temporal casts,
then I think we'll be able to cover all cast use-cases.
Author: Neville Dipale <[email protected]>
Closes #3896 from nevi-me/ARROW-4865 and squashes the following commits:
36530ba <Neville Dipale> restrict sliced-array limitation to
primitive->list casts
fd1c49c <Neville Dipale> disable casting sliced arrays
184d0d7 <Neville Dipale> address review comments
a68e472 <Neville Dipale> fix comment
3d27160 <Neville Dipale> ARROW-4865: Support list casts
---
rust/arrow/src/array.rs | 2 +-
rust/arrow/src/compute/kernels/cast.rs | 244 +++++++++++++++++++++++++++++++--
2 files changed, 237 insertions(+), 9 deletions(-)
diff --git a/rust/arrow/src/array.rs b/rust/arrow/src/array.rs
index d15d88b..7b692bb 100644
--- a/rust/arrow/src/array.rs
+++ b/rust/arrow/src/array.rs
@@ -131,7 +131,7 @@ pub type ArrayRef = Arc<Array>;
/// Constructs an array using the input `data`. Returns a reference-counted
`Array`
/// instance.
-fn make_array(data: ArrayDataRef) -> ArrayRef {
+pub(crate) fn make_array(data: ArrayDataRef) -> ArrayRef {
// TODO: here data_type() needs to clone the type - maybe add a type tag
enum to
// avoid the cloning.
match data.data_type().clone() {
diff --git a/rust/arrow/src/compute/kernels/cast.rs
b/rust/arrow/src/compute/kernels/cast.rs
index 3116c7e..d19097f 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-//! Defines cast kernels for `ArrayRef`, allowing casting arrays between
supported datatypes.
+//! Defines cast kernels for `ArrayRef`, allowing casting arrays between
supported
+//! datatypes.
//!
//! Example:
//!
@@ -37,6 +38,8 @@
use std::sync::Arc;
use crate::array::*;
+use crate::array_data::ArrayData;
+use crate::buffer::Buffer;
use crate::builder::*;
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
@@ -48,10 +51,12 @@ use crate::error::{ArrowError, Result};
/// * Utf8 to numeric: strings that can't be parsed to numbers return null,
float strings
/// in integer casts return null
/// * Numeric to boolean: 0 returns `false`, any other value returns `true`
+/// * List to List: the underlying data type is cast
+/// * Primitive to List: a list array with 1 value per slot is created
///
/// Unsupported Casts
/// * To or from `StructArray`
-/// * To or from `ListArray`
+/// * List to primitive
/// * Utf8 to boolean
pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
use DataType::*;
@@ -68,15 +73,60 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) ->
Result<ArrayRef> {
(_, Struct(_)) => Err(ArrowError::ComputeError(
"Cannot cast to struct from other types".to_string(),
)),
- (List(_), List(_)) => Err(ArrowError::ComputeError(
- "Casting between lists not yet supported".to_string(),
- )),
+ (List(_), List(ref to)) => {
+ let data = array.data_ref();
+ let underlying_array = make_array(data.child_data()[0].clone());
+ let cast_array = cast(&underlying_array, &to)?;
+ let array_data = ArrayData::new(
+ *to.clone(),
+ array.len(),
+ Some(cast_array.null_count()),
+ cast_array
+ .data()
+ .null_bitmap()
+ .clone()
+ .map(|bitmap| bitmap.bits),
+ array.offset(),
+ // reuse offset buffer
+ data.buffers().to_vec(),
+ vec![cast_array.data()],
+ );
+ let list = ListArray::from(Arc::new(array_data));
+ Ok(Arc::new(list) as ArrayRef)
+ }
(List(_), _) => Err(ArrowError::ComputeError(
"Cannot cast list to non-list data types".to_string(),
)),
- (_, List(_)) => Err(ArrowError::ComputeError(
- "Cannot cast primitive types to lists".to_string(),
- )),
+ (_, List(ref to)) => {
+ // see ARROW-4886 for this limitation
+ if array.offset() != 0 {
+ return Err(ArrowError::ComputeError(
+ "Cast kernel does not yet support sliced (non-zero offset)
arrays"
+ .to_string(),
+ ));
+ }
+ // cast primitive to list's primitive
+ let cast_array = cast(array, &to)?;
+ // create offsets, where if array.len() = 2, we have [0,1,2]
+ let offsets: Vec<i32> = (0..array.len() as i32 + 1).collect();
+ let value_offsets = Buffer::from(offsets[..].to_byte_slice());
+ let list_data = ArrayData::new(
+ *to.clone(),
+ array.len(),
+ Some(cast_array.null_count()),
+ cast_array
+ .data()
+ .null_bitmap()
+ .clone()
+ .map(|bitmap| bitmap.bits),
+ 0,
+ vec![value_offsets],
+ vec![cast_array.data()],
+ );
+ let list_array = Arc::new(ListArray::from(Arc::new(list_data))) as
ArrayRef;
+
+ Ok(list_array)
+ }
(_, Boolean) => match from_type {
UInt8 => cast_numeric_to_bool::<UInt8Type>(array),
UInt16 => cast_numeric_to_bool::<UInt16Type>(array),
@@ -458,6 +508,7 @@ where
#[cfg(test)]
mod tests {
use super::*;
+ use crate::buffer::Buffer;
#[test]
fn test_cast_i32_to_f64() {
@@ -487,6 +538,23 @@ mod tests {
}
#[test]
+ fn test_cast_i32_to_u8_sliced() {
+ let a = Int32Array::from(vec![-5, 6, -7, 8, 100000000]);
+ let array = Arc::new(a) as ArrayRef;
+ assert_eq!(0, array.offset());
+ let array = array.slice(2, 3);
+ assert_eq!(2, array.offset());
+ let b = cast(&array, &DataType::UInt8).unwrap();
+ assert_eq!(3, b.len());
+ assert_eq!(0, b.offset());
+ let c = b.as_any().downcast_ref::<UInt8Array>().unwrap();
+ assert_eq!(false, c.is_valid(0));
+ assert_eq!(8, c.value(1));
+ // overflows return None
+ assert_eq!(false, c.is_valid(2));
+ }
+
+ #[test]
fn test_cast_i32_to_i32() {
let a = Int32Array::from(vec![5, 6, 7, 8, 9]);
let array = Arc::new(a) as ArrayRef;
@@ -500,6 +568,88 @@ mod tests {
}
#[test]
+ fn test_cast_i32_to_list_i32() {
+ let a = Int32Array::from(vec![5, 6, 7, 8, 9]);
+ let array = Arc::new(a) as ArrayRef;
+ let b = cast(&array,
&DataType::List(Box::new(DataType::Int32))).unwrap();
+ assert_eq!(5, b.len());
+ let arr = b.as_any().downcast_ref::<ListArray>().unwrap();
+ assert_eq!(0, arr.value_offset(0));
+ assert_eq!(1, arr.value_offset(1));
+ assert_eq!(2, arr.value_offset(2));
+ assert_eq!(3, arr.value_offset(3));
+ assert_eq!(4, arr.value_offset(4));
+ assert_eq!(1, arr.value_length(0));
+ assert_eq!(1, arr.value_length(1));
+ assert_eq!(1, arr.value_length(2));
+ assert_eq!(1, arr.value_length(3));
+ assert_eq!(1, arr.value_length(4));
+ let values = arr.values();
+ let c = values.as_any().downcast_ref::<Int32Array>().unwrap();
+ assert_eq!(5, c.value(0));
+ assert_eq!(6, c.value(1));
+ assert_eq!(7, c.value(2));
+ assert_eq!(8, c.value(3));
+ assert_eq!(9, c.value(4));
+ }
+
+ #[test]
+ fn test_cast_i32_to_list_i32_nullable() {
+ let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8),
Some(9)]);
+ let array = Arc::new(a) as ArrayRef;
+ let b = cast(&array,
&DataType::List(Box::new(DataType::Int32))).unwrap();
+ assert_eq!(5, b.len());
+ assert_eq!(1, b.null_count());
+ let arr = b.as_any().downcast_ref::<ListArray>().unwrap();
+ assert_eq!(0, arr.value_offset(0));
+ assert_eq!(1, arr.value_offset(1));
+ assert_eq!(2, arr.value_offset(2));
+ assert_eq!(3, arr.value_offset(3));
+ assert_eq!(4, arr.value_offset(4));
+ assert_eq!(1, arr.value_length(0));
+ assert_eq!(1, arr.value_length(1));
+ assert_eq!(1, arr.value_length(2));
+ assert_eq!(1, arr.value_length(3));
+ assert_eq!(1, arr.value_length(4));
+ let values = arr.values();
+ let c = values.as_any().downcast_ref::<Int32Array>().unwrap();
+ assert_eq!(1, c.null_count());
+ assert_eq!(5, c.value(0));
+ assert_eq!(false, c.is_valid(1));
+ assert_eq!(7, c.value(2));
+ assert_eq!(8, c.value(3));
+ assert_eq!(9, c.value(4));
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Cast kernel does not yet support sliced (non-zero offset)
arrays"
+ )]
+ fn test_cast_i32_to_list_i32_nullable_sliced() {
+ let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), None]);
+ let array = Arc::new(a) as ArrayRef;
+ let array = array.slice(2, 3);
+ let b = cast(&array,
&DataType::List(Box::new(DataType::Int32))).unwrap();
+ assert_eq!(3, b.len());
+ assert_eq!(1, b.null_count());
+ let arr = b.as_any().downcast_ref::<ListArray>().unwrap();
+ assert_eq!(0, arr.value_offset(0));
+ assert_eq!(1, arr.value_offset(1));
+ assert_eq!(2, arr.value_offset(2));
+ assert_eq!(1, arr.value_length(0));
+ assert_eq!(1, arr.value_length(1));
+ assert_eq!(1, arr.value_length(2));
+ let values = arr.values();
+ let c = values.as_any().downcast_ref::<Int32Array>().unwrap();
+ assert_eq!(1, c.null_count());
+ assert_eq!(7, c.value(0));
+ assert_eq!(8, c.value(1));
+ // if one removes the non-zero-offset limitation, this assertion
passes when it
+ // shouldn't
+ assert_eq!(0, c.value(2));
+ }
+
+ #[test]
fn test_cast_utf_to_i32() {
let a = BinaryArray::from(vec!["5", "6", "seven", "8", "9.1"]);
let array = Arc::new(a) as ArrayRef;
@@ -543,4 +693,82 @@ mod tests {
let array = Arc::new(a) as ArrayRef;
cast(&array, &DataType::Timestamp(TimeUnit::Microsecond)).unwrap();
}
+
+ #[test]
+ fn test_cast_list_i32_to_list_u16() {
+ // Construct a value array
+ let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2,
100000000]).data();
+
+ let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice());
+
+ // Construct a list array from the above two
+ let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data = ArrayData::builder(list_data_type.clone())
+ .len(3)
+ .add_buffer(value_offsets.clone())
+ .add_child_data(value_data.clone())
+ .build();
+ let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef;
+
+ let cast_array =
+ cast(&list_array,
&DataType::List(Box::new(DataType::UInt16))).unwrap();
+ // 3 negative values should get lost when casting to unsigned,
+ // 1 value should overflow
+ assert_eq!(4, cast_array.null_count());
+ // offsets should be the same
+ assert_eq!(
+ list_array.data().buffers().to_vec(),
+ cast_array.data().buffers().to_vec()
+ );
+ let array = cast_array
+ .as_ref()
+ .as_any()
+ .downcast_ref::<ListArray>()
+ .unwrap();
+ assert_eq!(DataType::UInt16, array.value_type());
+ assert_eq!(4, array.values().null_count());
+ assert_eq!(3, array.value_length(0));
+ assert_eq!(3, array.value_length(1));
+ assert_eq!(2, array.value_length(2));
+ let values = array.values();
+ let u16arr = values.as_any().downcast_ref::<UInt16Array>().unwrap();
+ assert_eq!(8, u16arr.len());
+ assert_eq!(4, u16arr.null_count());
+
+ assert_eq!(0, u16arr.value(0));
+ assert_eq!(0, u16arr.value(1));
+ assert_eq!(0, u16arr.value(2));
+ assert_eq!(false, u16arr.is_valid(3));
+ assert_eq!(false, u16arr.is_valid(4));
+ assert_eq!(false, u16arr.is_valid(5));
+ assert_eq!(2, u16arr.value(6));
+ assert_eq!(false, u16arr.is_valid(7));
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Casting from Int32 to Timestamp(Microsecond) not supported"
+ )]
+ fn test_cast_list_i32_to_list_timestamp() {
+ // Construct a value array
+ let value_data =
+ Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 8,
100000000]).data();
+
+ let value_offsets = Buffer::from(&[0, 3, 6, 9].to_byte_slice());
+
+ // Construct a list array from the above two
+ let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data = ArrayData::builder(list_data_type.clone())
+ .len(3)
+ .add_buffer(value_offsets.clone())
+ .add_child_data(value_data.clone())
+ .build();
+ let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef;
+
+ cast(
+ &list_array,
+
&DataType::List(Box::new(DataType::Timestamp(TimeUnit::Microsecond))),
+ )
+ .unwrap();
+ }
}