tustvold commented on a change in pull request #1444:
URL: https://github.com/apache/arrow-rs/pull/1444#discussion_r836251359
##########
File path: arrow/src/array/data.rs
##########
@@ -1131,44 +1124,97 @@ impl ArrayData {
)
}
- /// Ensures that for each union element, the offset is correct for
- /// the corresponding child array
- fn validate_dense_union_full(&self) -> Result<()> {
- // safety justification is that the size of the buffers was validated
in self.validate()
- let type_ids = self.typed_offsets::<i8>(&self.buffers[0])?;
- let offsets = self.typed_offsets::<i32>(&self.buffers[1])?;
+ /// Returns an iterator over validated Result<(i, type_id)> for
+ /// each element i in the `UnionArray`, returning an Err if the
+ /// type_id is invalid
+ fn for_each_valid_type_id(
+ &self,
+ ) -> impl Iterator<Item = Result<(usize, usize)>> + '_ {
+ assert!(matches!(self.data_type(), DataType::Union(_, _)));
+ let buffer = &self.buffers[0];
+ let required_len = self.len + self.offset;
+ assert!(buffer.len() / std::mem::size_of::<i8>() >= required_len);
- type_ids.iter().enumerate().try_for_each(|(i, &type_id)| {
- // this will panic if out of bounds. Could make a nicer error
message
+ // Justification: buffer size was validated above
+ let type_ids = unsafe {
&(buffer.typed_data::<i8>()[self.offset..required_len]) };
+
+ let num_children = self.child_data.len();
+ type_ids.iter().enumerate().map(move |(i, &type_id)| {
let type_id: usize = type_id
.try_into()
.map_err(|_| {
ArrowError::InvalidArgumentError(format!(
- "Offset invariant failure: Could not convert type id
{} to usize in slot {}",
+ "Type invariant failure: Could not convert type id {}
to usize in slot {}",
type_id, i))
})?;
- let num_children = self.child_data[type_id].len();
- let child_offset: usize = offsets[i]
- .try_into()
- .map_err(|_| {
- ArrowError::InvalidArgumentError(format!(
- "Offset invariant failure: Could not convert offset {}
at position {} to usize",
- offsets[i], i))
- })?;
+ if type_id >= num_children {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Type invariant failure: type id {} at position {}
invalid. Expected < {}",
+ type_id, i, num_children)));
+ }
+ Ok((i, type_id))
+ })
+ }
- if child_offset >= num_children {
- Err(ArrowError::InvalidArgumentError(format!(
- "Value at position {} out of bounds: {} (child array {}
length is {})",
- i, child_offset, type_id, num_children
- )))
- } else {
- Ok(())
- }
+ /// Ensures that for each union element, the type_id is valid (between
0..children.len())
+ fn validate_sparse_union_full(&self) -> Result<()> {
+ self.for_each_valid_type_id().try_for_each(|r| {
+ // No additional validation is needed other than the
+ // type_id is within range, which is done by the iterator
+ r?;
+ Ok(())
})
}
+ /// Ensures that for each union element, the offset is correct for
+ /// the corresponding child array
+ fn validate_dense_union_full(&self) -> Result<()> {
+ assert!(matches!(self.data_type(), DataType::Union(_, _)));
+ let buffer = &self.buffers[1];
+ let required_len = self.len + self.offset;
+ assert!(buffer.len() / std::mem::size_of::<i32>() >= required_len);
+
+ // Justification: buffer size was validated above
+ let offsets = unsafe {
&(buffer.typed_data::<i32>()[self.offset..required_len]) };
+
+ let mut last_offset = None;
+ self.for_each_valid_type_id()
+ .zip(offsets.iter())
+ .try_for_each(|(r, &child_offset)| {
+ let (i, type_id) = r?;
+
+ let child_offset: usize = child_offset
+ .try_into()
+ .map_err(|_| {
+ ArrowError::InvalidArgumentError(format!(
+ "Offset invariant failure: Could not convert
offset {} at position {} to usize",
+ child_offset, i))
+ })?;
+
+ if let Some(last_offset) = last_offset.as_ref() {
Review comment:
This is incorrect, the requirement is that for a given type the offsets
must be increasing. But it is perfectly correct, in fact desirable, for the
offsets array itself to not be increasing.
e.g. the following would be valid
```
types: [0, 1, 1, 0],
offsets: [0, 0, 1, 0],
```
Otherwise the dense representation is no different from the sparse
representation
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]