tustvold commented on a change in pull request #1444:
URL: https://github.com/apache/arrow-rs/pull/1444#discussion_r836257058



##########
File path: arrow/src/array/data.rs
##########
@@ -1130,6 +1124,97 @@ impl ArrayData {
         )
     }
 
+    /// Returns an iterator over validated Result<(i, type_id)> for
+    /// each element i in the `UnionArray`, returning an Err if the
+    /// type_id is invalid
+    fn for_each_valid_type_id(
+        &self,
+    ) -> impl Iterator<Item = Result<(usize, usize)>> + '_ {
+        assert!(matches!(self.data_type(), DataType::Union(_, _)));
+        let buffer = &self.buffers[0];
+        let required_len = self.len + self.offset;
+        assert!(buffer.len() / std::mem::size_of::<i8>() >= required_len);
+
+        // Justification: buffer size was validated above
+        let type_ids = unsafe { 
&(buffer.typed_data::<i8>()[self.offset..required_len]) };
+
+        let num_children = self.child_data.len();
+        type_ids.iter().enumerate().map(move |(i, &type_id)| {
+            let type_id: usize = type_id
+                .try_into()
+                .map_err(|_| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "Type invariant failure: Could not convert type id {} 
to usize in slot {}",
+                        type_id, i))
+                })?;
+
+            if type_id >= num_children {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                        "Type invariant failure: type id {} at position {} 
invalid. Expected < {}",
+                        type_id, i, num_children)));
+            }
+
+            Ok((i, type_id))
+        })
+    }
+
+    /// Ensures that for each union element, the type_id is valid (between 
0..children.len())
+    fn validate_sparse_union_full(&self) -> Result<()> {
+        self.for_each_valid_type_id().try_for_each(|r| {
+            // No additional validation is needed other than the
+            // type_id is within range, which is done by the iterator
+            r?;
+            Ok(())
+        })
+    }
+
+    /// Ensures that for each union element, the offset is correct for
+    /// the corresponding child array
+    fn validate_dense_union_full(&self) -> Result<()> {
+        assert!(matches!(self.data_type(), DataType::Union(_, _)));
+        let buffer = &self.buffers[1];
+        let required_len = self.len + self.offset;
+        assert!(buffer.len() / std::mem::size_of::<i32>() >= required_len);
+
+        // Justification: buffer size was validated above
+        let offsets = unsafe { 
&(buffer.typed_data::<i32>()[self.offset..required_len]) };
+
+        let mut last_offset = None;
+        self.for_each_valid_type_id()
+            .zip(offsets.iter())
+            .try_for_each(|(r, &child_offset)| {
+                let (i, type_id) = r?;

Review comment:
       I think this should be checking nulls, the docs seem to indicate that 
nulls can have arbitrary type ids and offsets...




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to