This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7d929f0aec7 feat(ffi): add run end encoded arrays (#5632)
7d929f0aec7 is described below
commit 7d929f0aec7b0d905671e99964b6cd76dc69a787
Author: Filippo Rossi <[email protected]>
AuthorDate: Mon Apr 15 13:57:07 2024 +0200
feat(ffi): add run end encoded arrays (#5632)
* feat(ffi): add run end encoded arrays
* fix(ffi): add correct data type layout for run end encoded arrays
* test: add test_nullable_run_array
---
arrow-data/src/data.rs | 19 +++++++++----
arrow-schema/src/ffi.rs | 17 ++++++++++++
arrow/src/ffi.rs | 71 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 102 insertions(+), 5 deletions(-)
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index e227b168eee..358c44e41b4 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -1544,14 +1544,14 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
- DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data
+ DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not implemented")
}
DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
- DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data,
+ DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
DataType::Union(_, mode) => {
let type_ids = BufferSpec::FixedWidth {
@@ -1612,9 +1612,8 @@ impl DataTypeLayout {
}
/// Describes arrays which have no data of their own
- /// (e.g. FixedSizeList). Note such arrays may still have a Null
- /// Bitmap
- pub fn new_empty() -> Self {
+ /// but may still have a Null Bitmap (e.g. FixedSizeList)
+ pub fn new_nullable_empty() -> Self {
Self {
buffers: vec![],
can_contain_null_mask: true,
@@ -1622,6 +1621,16 @@ impl DataTypeLayout {
}
}
+ /// Describes arrays which have no data of their own
+ /// (e.g. RunEndEncoded).
+ pub fn new_empty() -> Self {
+ Self {
+ buffers: vec![],
+ can_contain_null_mask: false,
+ variadic: false,
+ }
+ }
+
/// Describes a basic numeric array where each element has a fixed
/// with offset buffer of type `T`, followed by a
/// variable width data buffer
diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs
index 8a18c77ea29..d12e237d792 100644
--- a/arrow-schema/src/ffi.rs
+++ b/arrow-schema/src/ffi.rs
@@ -424,6 +424,14 @@ impl TryFrom<&FFI_ArrowSchema> for DataType {
let map_keys_sorted = c_schema.map_keys_sorted();
DataType::Map(Arc::new(Field::try_from(c_child)?), map_keys_sorted)
}
+ "+r" => {
+ let c_run_ends = c_schema.child(0);
+ let c_values = c_schema.child(1);
+ DataType::RunEndEncoded(
+ Arc::new(Field::try_from(c_run_ends)?),
+ Arc::new(Field::try_from(c_values)?),
+ )
+ }
// Parametrized types, requiring string parse
other => {
match other.splitn(2, ':').collect::<Vec<&str>>().as_slice() {
@@ -616,6 +624,10 @@ impl TryFrom<&DataType> for FFI_ArrowSchema {
.iter()
.map(FFI_ArrowSchema::try_from)
.collect::<Result<Vec<_>, ArrowError>>()?,
+ DataType::RunEndEncoded(run_ends, values) => vec![
+ FFI_ArrowSchema::try_from(run_ends.as_ref())?,
+ FFI_ArrowSchema::try_from(values.as_ref())?,
+ ],
_ => vec![],
};
let dictionary = if let DataType::Dictionary(_, value_data_type) = dtype {
@@ -681,6 +693,7 @@ fn get_format_string(dtype: &DataType) -> Result<String, ArrowError> {
DataType::LargeList(_) => Ok("+L".to_string()),
DataType::Struct(_) => Ok("+s".to_string()),
DataType::Map(_, _) => Ok("+m".to_string()),
+ DataType::RunEndEncoded(_, _) => Ok("+r".to_string()),
DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type),
DataType::Union(fields, mode) => {
let formats = fields
@@ -807,6 +820,10 @@ mod tests {
DataType::Utf8,
true,
)])));
+ round_trip_type(DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Int32, false)),
+ Arc::new(Field::new("values", DataType::Binary, true)),
+ ));
}
#[test]
diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs
index fe3f413924b..d33de9d655f 100644
--- a/arrow/src/ffi.rs
+++ b/arrow/src/ffi.rs
@@ -333,6 +333,11 @@ impl<'a> ImportedArrowArray<'a> {
.map(|(i, (_, field))| self.consume_child(i, field.data_type()))
.collect::<Result<Vec<_>>>()
}
+ DataType::RunEndEncoded(run_ends_field, values_field) => Ok([
+ self.consume_child(0, run_ends_field.data_type())?,
+ self.consume_child(1, values_field.data_type())?,
+ ]
+ .to_vec()),
_ => Ok(Vec::new()),
}
}
@@ -468,6 +473,7 @@ mod tests {
use arrow_array::cast::AsArray;
use arrow_array::types::{Float64Type, Int32Type};
use arrow_array::*;
+ use arrow_buffer::NullBuffer;
use crate::compute::kernels;
use crate::datatypes::{Field, Int8Type};
@@ -1176,4 +1182,69 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_run_array() -> Result<()> {
+ let value_data =
+ PrimitiveArray::<Int8Type>::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]);
+
+ // Construct a run_ends array:
+ let run_ends_values = [4_i32, 6, 7, 9, 13, 18, 20, 22];
+ let run_ends_data =
+ PrimitiveArray::<Int32Type>::from_iter_values(run_ends_values.iter().copied());
+
+ // Construct a run ends encoded array from the above two
+ let ree_array = RunArray::<Int32Type>::try_new(&run_ends_data, &value_data).unwrap();
+
+ // export it
+ let (array, schema) = to_ffi(&ree_array.to_data())?;
+
+ // (simulate consumer) import it
+ let data = unsafe { from_ffi(array, &schema) }?;
+ let array = make_array(data);
+
+ // perform some operation
+ let array = array
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+ assert_eq!(array.data_type(), ree_array.data_type());
+ assert_eq!(array.run_ends().values(), ree_array.run_ends().values());
+ assert_eq!(array.values(), ree_array.values());
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_nullable_run_array() -> Result<()> {
+ let nulls = NullBuffer::from(vec![true, false, true, true, false]);
+ let value_data =
+ PrimitiveArray::<Int8Type>::new(vec![1_i8, 2, 3, 4, 5].into(), Some(nulls));
+
+ // Construct a run_ends array:
+ let run_ends_values = [5_i32, 6, 7, 8, 10];
+ let run_ends_data =
+ PrimitiveArray::<Int32Type>::from_iter_values(run_ends_values.iter().copied());
+
+ // Construct a run ends encoded array from the above two
+ let ree_array = RunArray::<Int32Type>::try_new(&run_ends_data, &value_data).unwrap();
+
+ // export it
+ let (array, schema) = to_ffi(&ree_array.to_data())?;
+
+ // (simulate consumer) import it
+ let data = unsafe { from_ffi(array, &schema) }?;
+ let array = make_array(data);
+
+ // perform some operation
+ let array = array
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .unwrap();
+ assert_eq!(array.data_type(), ree_array.data_type());
+ assert_eq!(array.run_ends().values(), ree_array.run_ends().values());
+ assert_eq!(array.values(), ree_array.values());
+
+ Ok(())
+ }
}