This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new c5c34fa43 Don't recurse to children in ArrayData::try_new (#3248)
c5c34fa43 is described below

commit c5c34fa43141d01709485d3a008d3df93262c49c
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Dec 1 11:40:58 2022 +0000

    Don't recurse to children in ArrayData::try_new (#3248)
    
    * Don't recurse to children in ArrayData::validate_full
    
    * Add validate_data and update ArrayData::try_new
---
 arrow-data/src/data.rs          | 81 ++++++++++++++++++++++++++---------------
 arrow/tests/array_validation.rs | 34 -----------------
 2 files changed, 51 insertions(+), 64 deletions(-)

diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 811696e4d..b230dfdb7 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -286,7 +286,7 @@ impl ArrayData {
     /// # Safety
     ///
     /// The input values *must* form a valid Arrow array for
-    /// `data_type`, or undefined behavior can results.
+    /// `data_type`, or undefined behavior can result.
     ///
     /// Note: This is a low level API and most users of the arrow
     /// crate should create arrays using the methods in the `array`
@@ -318,19 +318,20 @@ impl ArrayData {
 
         // Provide a force_validate mode
         #[cfg(feature = "force_validate")]
-        new_self.validate_full().unwrap();
+        new_self.validate_data().unwrap();
         new_self
     }
 
-    /// Create a new ArrayData, validating that the provided buffers
-    /// form a valid Arrow array of the specified data type.
+    /// Create a new ArrayData, validating that the provided buffers form a 
valid
+    /// Arrow array of the specified data type.
     ///
     /// If the number of nulls in `null_bit_buffer` is 0 then the 
null_bit_buffer
     /// is set to `None`.
     ///
-    /// Note: This is a low level API and most users of the arrow
-    /// crate should create arrays using the methods in the `array`
-    /// module.
+    /// Internally this calls through to [`Self::validate_data`]
+    ///
+    /// Note: This is a low level API and most users of the arrow crate should 
create
+    /// arrays using the builders found in 
[arrow_array](https://docs.rs/arrow-array)
     pub fn try_new(
         data_type: DataType,
         len: usize,
@@ -366,7 +367,10 @@ impl ArrayData {
         };
 
         // As the data is not trusted, do a full validation of its contents
-        new_self.validate_full()?;
+        // We don't need to validate children as we can assume that the
+        // [`ArrayData`] in `child_data` have already been validated through
+        // a call to `ArrayData::try_new` or created using unsafe
+        new_self.validate_data()?;
         Ok(new_self)
     }
 
@@ -617,8 +621,8 @@ impl ArrayData {
     /// contents of the buffers (e.g. that all offsets for UTF8 arrays
     /// are within the bounds of the values buffer).
     ///
-    /// See [ArrayData::validate_full] to validate fully the offset content
-    /// and the validitiy of utf8 data
+    /// See [ArrayData::validate_data] to validate fully the offset content
+    /// and the validity of utf8 data
     pub fn validate(&self) -> Result<(), ArrowError> {
         // Need at least this mich space in each buffer
         let len_plus_offset = self.len + self.offset;
@@ -954,35 +958,34 @@ impl ArrayData {
         Ok(values_data)
     }
 
-    /// "expensive" validation that ensures:
+    /// Validate that the data contained within this [`ArrayData`] is valid
     ///
     /// 1. Null count is correct
     /// 2. All offsets are valid
     /// 3. All String data is valid UTF-8
     /// 4. All dictionary offsets are valid
     ///
-    /// Does not (yet) check
-    /// 1. Union type_ids are valid see 
[#85](https://github.com/apache/arrow-rs/issues/85)
-    /// Note calls `validate()` internally
-    pub fn validate_full(&self) -> Result<(), ArrowError> {
-        // Check all buffer sizes prior to looking at them more deeply in this 
function
+    /// Internally this calls:
+    ///
+    /// * [`Self::validate`]
+    /// * [`Self::validate_nulls`]
+    /// * [`Self::validate_values`]
+    ///
+    /// Note: this does not recurse into children, for a recursive variant
+    /// see [`Self::validate_full`]
+    pub fn validate_data(&self) -> Result<(), ArrowError> {
         self.validate()?;
-
-        let null_bitmap_buffer = self
-            .null_bitmap
-            .as_ref()
-            .map(|null_bitmap| null_bitmap.buffer_ref());
-
-        let actual_null_count = count_nulls(null_bitmap_buffer, self.offset, 
self.len);
-        if actual_null_count != self.null_count {
-            return Err(ArrowError::InvalidArgumentError(format!(
-                "null_count value ({}) doesn't match actual number of nulls in 
array ({})",
-                self.null_count, actual_null_count
-            )));
-        }
-
+        self.validate_nulls()?;
         self.validate_values()?;
+        Ok(())
+    }
 
+    /// Performs a full recursive validation of this [`ArrayData`] and all its 
children
+    ///
+    /// This is equivalent to calling [`Self::validate_data`] on this 
[`ArrayData`]
+    /// and all its children recursively
+    pub fn validate_full(&self) -> Result<(), ArrowError> {
+        self.validate_data()?;
         // validate all children recursively
         self.child_data
             .iter()
@@ -995,10 +998,28 @@ impl ArrayData {
                     ))
                 })
             })?;
+        Ok(())
+    }
 
+    /// Validates the the null count is correct
+    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
+        let nulls = self.null_buffer();
+
+        let actual_null_count = count_nulls(nulls, self.offset, self.len);
+        if actual_null_count != self.null_count {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "null_count value ({}) doesn't match actual number of nulls in 
array ({})",
+                self.null_count, actual_null_count
+            )));
+        }
         Ok(())
     }
 
+    /// Validates the values stored within this [`ArrayData`] are valid
+    /// without recursing into child [`ArrayData`]
+    ///
+    /// Does not (yet) check
+    /// 1. Union type_ids are valid see 
[#85](https://github.com/apache/arrow-rs/issues/85)
     pub fn validate_values(&self) -> Result<(), ArrowError> {
         match &self.data_type {
             DataType::Utf8 => self.validate_utf8::<i32>(),
diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs
index 4faf69658..64c433a66 100644
--- a/arrow/tests/array_validation.rs
+++ b/arrow/tests/array_validation.rs
@@ -753,40 +753,6 @@ fn test_validate_list_negative_offsets() {
     .unwrap();
 }
 
-#[test]
-#[should_panic(expected = "Value at position 1 out of bounds: -1 (should be in 
[0, 1])")]
-/// test that children are validated recursively (aka bugs in child data of 
struct also are flagged)
-fn test_validate_recursive() {
-    // Form invalid dictionary array
-    let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
-    // -1 is not a valid index
-    let keys: Int32Array = [Some(1), Some(-1), Some(1)].into_iter().collect();
-
-    let dict_data_type = DataType::Dictionary(
-        Box::new(keys.data_type().clone()),
-        Box::new(values.data_type().clone()),
-    );
-
-    // purposely create an invalid child data
-    let dict_data = unsafe {
-        ArrayData::new_unchecked(
-            dict_data_type,
-            2,
-            None,
-            None,
-            0,
-            vec![keys.data().buffers()[0].clone()],
-            vec![values.into_data()],
-        )
-    };
-
-    // Now, try and create a struct with this invalid child data (and expect 
an error)
-    let data_type =
-        DataType::Struct(vec![Field::new("d", dict_data.data_type().clone(), 
true)]);
-
-    ArrayData::try_new(data_type, 1, None, 0, vec![], 
vec![dict_data]).unwrap();
-}
-
 /// returns a buffer initialized with some constant value for tests
 fn make_i32_buffer(n: usize) -> Buffer {
     Buffer::from_slice_ref(&vec![42i32; n])

Reply via email to