tustvold commented on a change in pull request #1444:
URL: https://github.com/apache/arrow-rs/pull/1444#discussion_r836252855



##########
File path: arrow/src/array/data.rs
##########
@@ -2537,6 +2639,102 @@ mod tests {
         .unwrap();
     }
 
+    #[test]
+    #[should_panic(
+        expected = "Need at least 4 bytes in buffers[0] in array of type Union"
+    )]
+    fn test_validate_dense_union_too_few_values() {
+        // not enough type_ids
+        let type_ids = vec![0i8, 1i8];
+        let offsets = vec![0i32, 0i32];
+        run_dense_union_test(type_ids, offsets);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Type invariant failure: type id 2 at position 1 invalid. 
Expected < 2"
+    )]
+    fn test_validate_dense_union_bad_type_id() {
+        // typeid of 2 is not valid (only 1 and 0)
+        let type_ids = vec![0i8, 1i8, 2i8, 1i8];
+        let offsets = vec![0i32, 0i32, 1i32, 2i32];
+        run_dense_union_test(type_ids, offsets);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Type invariant failure: Could not convert type id -1 to 
usize in slot 1"
+    )]
+    fn test_validate_dense_union_negative_type_id() {
+        // typeid of -1 is clearly not valid (only 1 and 0)
+        let type_ids = vec![1i8, 0i8, -1i8, 1i8];
+        let offsets = vec![0i32, 0i32, 1i32, 2i32];
+        run_dense_union_test(type_ids, offsets);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: Could not convert offset -1 at 
position 1 to usize"
+    )]
+    fn test_validate_dense_union_negative_offset() {
+        let type_ids = vec![1i8, 0i8, 0i8, 1i8];
+        // offset of -1 is clearly not valid (only 1 and 0)
+        let offsets = vec![0i32, 0i32, -1i32, 2i32];
+        run_dense_union_test(type_ids, offsets);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Value at position 1 out of bounds: 10 (child array 0 
length is 2"
+    )]
+    fn test_validate_dense_union_invalid_child_offset() {
+        let type_ids = vec![1i8, 0i8, 0i8, 1i8];
+        // child arrays only have 2 and 3 elements each
+        let offsets = vec![0i32, 0i32, 10i32, 2i32];
+        run_dense_union_test(type_ids, offsets);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: position 2 invalid. 0 should be 
>= 1"
+    )]
+    fn test_validate_dense_union_non_increasing_offset() {

Review comment:
       This test should not panic, it is valid as no single child-array has 
non-increasing offsets.
   
   This is not valid
   ```
   type_ids = [0, 0, 0],
   offsets = [0, 1, 0],
   ```
   
   
   But this is
   ```
   type_ids = [0, 0, 1],
   offsets = [0, 1, 0],
   ```

##########
File path: arrow/src/array/data.rs
##########
@@ -1131,44 +1124,97 @@ impl ArrayData {
         )
     }
 
-    /// Ensures that for each union element, the offset is correct for
-    /// the corresponding child array
-    fn validate_dense_union_full(&self) -> Result<()> {
-        // safety justification is that the size of the buffers was validated 
in self.validate()
-        let type_ids = self.typed_offsets::<i8>(&self.buffers[0])?;
-        let offsets = self.typed_offsets::<i32>(&self.buffers[1])?;
+    /// Returns an iterator over validated Result<(i, type_id)> for
+    /// each element i in the `UnionArray`, returning an Err if the
+    /// type_id is invalid
+    fn for_each_valid_type_id(
+        &self,
+    ) -> impl Iterator<Item = Result<(usize, usize)>> + '_ {
+        assert!(matches!(self.data_type(), DataType::Union(_, _)));
+        let buffer = &self.buffers[0];
+        let required_len = self.len + self.offset;
+        assert!(buffer.len() / std::mem::size_of::<i8>() >= required_len);
 
-        type_ids.iter().enumerate().try_for_each(|(i, &type_id)| {
-            // this will panic if out of bounds. Could make a nicer error 
message
+        // Justification: buffer size was validated above
+        let type_ids = unsafe { 
&(buffer.typed_data::<i8>()[self.offset..required_len]) };
+
+        let num_children = self.child_data.len();
+        type_ids.iter().enumerate().map(move |(i, &type_id)| {
             let type_id: usize = type_id
                 .try_into()
                 .map_err(|_| {
                     ArrowError::InvalidArgumentError(format!(
-                        "Offset invariant failure: Could not convert type id 
{} to usize in slot {}",
+                        "Type invariant failure: Could not convert type id {} 
to usize in slot {}",
                         type_id, i))
                 })?;
 
-            let num_children = self.child_data[type_id].len();
-            let child_offset: usize = offsets[i]
-                .try_into()
-                .map_err(|_| {
-                    ArrowError::InvalidArgumentError(format!(
-                        "Offset invariant failure: Could not convert offset {} 
at position {} to usize",
-                        offsets[i], i))
-                })?;
+            if type_id >= num_children {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                        "Type invariant failure: type id {} at position {} 
invalid. Expected < {}",
+                        type_id, i, num_children)));
+            }
 
+            Ok((i, type_id))
+        })
+    }
 
-            if child_offset >= num_children {
-                Err(ArrowError::InvalidArgumentError(format!(
-                    "Value at position {} out of bounds: {} (child array {} 
length is {})",
-                    i, child_offset, type_id, num_children
-                )))
-            } else {
-                Ok(())
-            }
+    /// Ensures that for each union element, the type_id is valid (between 
0..children.len())
+    fn validate_sparse_union_full(&self) -> Result<()> {
+        self.for_each_valid_type_id().try_for_each(|r| {
+            // No additional validation is needed other than the
+            // type_id is within range, which is done by the iterator
+            r?;
+            Ok(())
         })
     }
 
+    /// Ensures that for each union element, the offset is correct for
+    /// the corresponding child array
+    fn validate_dense_union_full(&self) -> Result<()> {
+        assert!(matches!(self.data_type(), DataType::Union(_, _)));
+        let buffer = &self.buffers[1];
+        let required_len = self.len + self.offset;
+        assert!(buffer.len() / std::mem::size_of::<i32>() >= required_len);
+
+        // Justification: buffer size was validated above
+        let offsets = unsafe { 
&(buffer.typed_data::<i32>()[self.offset..required_len]) };
+
+        let mut last_offset = None;
+        self.for_each_valid_type_id()
+            .zip(offsets.iter())
+            .try_for_each(|(r, &child_offset)| {
+                let (i, type_id) = r?;
+
+                let child_offset: usize = child_offset
+                    .try_into()
+                    .map_err(|_| {
+                        ArrowError::InvalidArgumentError(format!(
+                            "Offset invariant failure: Could not convert 
offset {} at position {} to usize",
+                            child_offset, i))
+                    })?;
+
+                if let Some(last_offset) = last_offset.as_ref() {

Review comment:
       This is incorrect, the requirement is that for a given type the offsets 
must be increasing. But it is perfectly correct, in fact desirable, for the 
offsets array itself to not be increasing.
   
   e.g. the following would be valid
   
   ```
   types: [0, 1, 1, 0],
   offsets: [0, 0, 1, 0],
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to