This is an automated email from the ASF dual-hosted git repository. tustvold pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push: new 7d929f0aec7 feat(ffi): add run end encoded arrays (#5632) 7d929f0aec7 is described below commit 7d929f0aec7b0d905671e99964b6cd76dc69a787 Author: Filippo Rossi <filippo.ro...@datadoghq.com> AuthorDate: Mon Apr 15 13:57:07 2024 +0200 feat(ffi): add run end encoded arrays (#5632) * feat(ffi): add run end encoded arrays * fix(ffi): add correct data type layout for run end encoded arrays * test: add test_nullable_run_array --- arrow-data/src/data.rs | 19 +++++++++---- arrow-schema/src/ffi.rs | 17 ++++++++++++ arrow/src/ffi.rs | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 5 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index e227b168eee..358c44e41b4 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1544,14 +1544,14 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::Utf8 => DataTypeLayout::new_binary::<i32>(), DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(), DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(), - DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data + DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(), DataType::ListView(_) | DataType::LargeListView(_) => { unimplemented!("ListView/LargeListView not implemented") } DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(), DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(), - DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, + DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data, DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data, DataType::Union(_, mode) => { let type_ids = BufferSpec::FixedWidth { @@ -1612,9 +1612,8 @@ impl DataTypeLayout { } /// Describes arrays which have no data of their own - /// (e.g. FixedSizeList). Note such arrays may still have a Null - /// Bitmap - pub fn new_empty() -> Self { + /// but may still have a Null Bitmap (e.g. FixedSizeList) + pub fn new_nullable_empty() -> Self { Self { buffers: vec![], can_contain_null_mask: true, @@ -1622,6 +1621,16 @@ impl DataTypeLayout { } } + /// Describes arrays which have no data of their own + /// (e.g. RunEndEncoded). + pub fn new_empty() -> Self { + Self { + buffers: vec![], + can_contain_null_mask: false, + variadic: false, + } + } + /// Describes a basic numeric array where each element has a fixed /// with offset buffer of type `T`, followed by a /// variable width data buffer diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 8a18c77ea29..d12e237d792 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -424,6 +424,14 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { let map_keys_sorted = c_schema.map_keys_sorted(); DataType::Map(Arc::new(Field::try_from(c_child)?), map_keys_sorted) } + "+r" => { + let c_run_ends = c_schema.child(0); + let c_values = c_schema.child(1); + DataType::RunEndEncoded( + Arc::new(Field::try_from(c_run_ends)?), + Arc::new(Field::try_from(c_values)?), + ) + } // Parametrized types, requiring string parse other => { match other.splitn(2, ':').collect::<Vec<&str>>().as_slice() { @@ -616,6 +624,10 @@ impl TryFrom<&DataType> for FFI_ArrowSchema { .iter() .map(FFI_ArrowSchema::try_from) .collect::<Result<Vec<_>, ArrowError>>()?, + DataType::RunEndEncoded(run_ends, values) => vec![ + FFI_ArrowSchema::try_from(run_ends.as_ref())?, + FFI_ArrowSchema::try_from(values.as_ref())?, + ], _ => vec![], }; let dictionary = if let DataType::Dictionary(_, value_data_type) = dtype { @@ -681,6 +693,7 @@ fn get_format_string(dtype: &DataType) -> Result<String, ArrowError> { DataType::LargeList(_) => Ok("+L".to_string()), DataType::Struct(_) => Ok("+s".to_string()), DataType::Map(_, _) => Ok("+m".to_string()), + DataType::RunEndEncoded(_, _) => Ok("+r".to_string()), DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type), DataType::Union(fields, mode) => { let formats = fields @@ -807,6 +820,10 @@ mod tests { DataType::Utf8, true, )]))); + round_trip_type(DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Binary, true)), + )); } #[test] diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index fe3f413924b..d33de9d655f 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -333,6 +333,11 @@ impl<'a> ImportedArrowArray<'a> { .map(|(i, (_, field))| self.consume_child(i, field.data_type())) .collect::<Result<Vec<_>>>() } + DataType::RunEndEncoded(run_ends_field, values_field) => Ok([ + self.consume_child(0, run_ends_field.data_type())?, + self.consume_child(1, values_field.data_type())?, + ] + .to_vec()), _ => Ok(Vec::new()), } } @@ -468,6 +473,7 @@ mod tests { use arrow_array::cast::AsArray; use arrow_array::types::{Float64Type, Int32Type}; use arrow_array::*; + use arrow_buffer::NullBuffer; use crate::compute::kernels; use crate::datatypes::{Field, Int8Type}; @@ -1176,4 +1182,69 @@ mod tests { Ok(()) } + + #[test] + fn test_run_array() -> Result<()> { + let value_data = + PrimitiveArray::<Int8Type>::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); + + // Construct a run_ends array: + let run_ends_values = [4_i32, 6, 7, 9, 13, 18, 20, 22]; + let run_ends_data = + PrimitiveArray::<Int32Type>::from_iter_values(run_ends_values.iter().copied()); + + // Construct a run ends encoded array from the above two + let ree_array = RunArray::<Int32Type>::try_new(&run_ends_data, &value_data).unwrap(); + + // export it + let (array, schema) = to_ffi(&ree_array.to_data())?; + + // (simulate consumer) import it + let data = unsafe { from_ffi(array, &schema) }?; + let array = make_array(data); + + // perform some operation + let array = array + .as_any() + .downcast_ref::<RunArray<Int32Type>>() + .unwrap(); + assert_eq!(array.data_type(), ree_array.data_type()); + assert_eq!(array.run_ends().values(), ree_array.run_ends().values()); + assert_eq!(array.values(), ree_array.values()); + + Ok(()) + } + + #[test] + fn test_nullable_run_array() -> Result<()> { + let nulls = NullBuffer::from(vec![true, false, true, true, false]); + let value_data = + PrimitiveArray::<Int8Type>::new(vec![1_i8, 2, 3, 4, 5].into(), Some(nulls)); + + // Construct a run_ends array: + let run_ends_values = [5_i32, 6, 7, 8, 10]; + let run_ends_data = + PrimitiveArray::<Int32Type>::from_iter_values(run_ends_values.iter().copied()); + + // Construct a run ends encoded array from the above two + let ree_array = RunArray::<Int32Type>::try_new(&run_ends_data, &value_data).unwrap(); + + // export it + let (array, schema) = to_ffi(&ree_array.to_data())?; + + // (simulate consumer) import it + let data = unsafe { from_ffi(array, &schema) }?; + let array = make_array(data); + + // perform some operation + let array = array + .as_any() + .downcast_ref::<RunArray<Int32Type>>() + .unwrap(); + assert_eq!(array.data_type(), ree_array.data_type()); + assert_eq!(array.run_ends().values(), ree_array.run_ends().values()); + assert_eq!(array.values(), ree_array.values()); + + Ok(()) + } }