alamb commented on a change in pull request #885:
URL: https://github.com/apache/arrow-rs/pull/885#discussion_r739655539
##########
File path: arrow/src/array/array_union.rs
##########
@@ -74,31 +74,38 @@ impl UnionArray {
let (field_types, field_values): (Vec<_>, Vec<_>) =
child_arrays.into_iter().unzip();
let len = type_ids.len();
- let mut builder = ArrayData::builder(DataType::Union(field_types))
+
+ let mode = if value_offsets.is_some() {
+ UnionMode::Dense
+ } else {
+ UnionMode::Sparse
+ };
+
+ let mut builder = ArrayData::builder(DataType::Union(field_types,
mode))
.add_buffer(type_ids)
.child_data(field_values.into_iter().map(|a|
a.data().clone()).collect())
.len(len);
if let Some(bitmap) = bitmap_data {
builder = builder.null_bit_buffer(bitmap)
}
- let data = unsafe {
+ let data = {
match value_offsets {
- Some(b) => builder.add_buffer(b).build_unchecked(),
- None => builder.build_unchecked(),
+ Some(b) => builder.add_buffer(b).build().unwrap(),
+ None => builder.build().unwrap(),
}
};
Self::from(data)
}
- /// Attempts to create a new `UnionArray` and validates the inputs
provided.
+
+ /// Attempts to create a new `UnionArray`, validating the inputs provided.
pub fn try_new(
type_ids: Buffer,
value_offsets: Option<Buffer>,
child_arrays: Vec<(Field, ArrayRef)>,
bitmap: Option<Buffer>,
) -> Result<Self> {
if let Some(b) = &value_offsets {
- let nulls = count_nulls(bitmap.as_ref(), 0, type_ids.len());
- if ((type_ids.len() - nulls) * 4) != b.len() {
+ if ((type_ids.len()) * 4) != b.len() {
Review comment:
here is the core format change -- the length of `type_ids` must match
the length of the `values` (rather than skipping nulls)
##########
File path: arrow/src/array/array_union.rs
##########
@@ -65,7 +65,7 @@ impl UnionArray {
/// In both of the cases above we are accepting `Buffer`'s which are
assumed to be representing
/// `i8` and `i32` values respectively. `Buffer` objects are untyped and
no attempt is made
/// to ensure that the data provided is valid.
- pub fn new(
+ pub unsafe fn new_unchecked(
Review comment:
here is the API change
##########
File path: arrow/src/array/data.rs
##########
@@ -941,11 +947,26 @@ fn layout(data_type: &DataType) -> DataTypeLayout {
DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all
in child data
DataType::LargeList(_) =>
DataTypeLayout::new_fixed_width(size_of::<i32>()),
DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child
data,
- DataType::Union(_) => {
- DataTypeLayout::new_fixed_width(size_of::<u8>())
- // Note sparse unions only have one buffer (u8) type_ids,
- // and dense unions have 2 (type_ids as well as offsets).
- // https://github.com/apache/arrow-rs/issues/85
+ DataType::Union(_, mode) => {
Review comment:
we can now validate the buffer layout based on `DataType`!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]