This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c5c34fa43 Don't recurse to children in ArrayData::try_new (#3248)
c5c34fa43 is described below
commit c5c34fa43141d01709485d3a008d3df93262c49c
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Dec 1 11:40:58 2022 +0000
Don't recurse to children in ArrayData::try_new (#3248)
* Don't recurse to children in ArrayData::validate_full
* Add validate_data and update ArrayData::try_new
---
arrow-data/src/data.rs | 81 ++++++++++++++++++++++++++---------------
arrow/tests/array_validation.rs | 34 -----------------
2 files changed, 51 insertions(+), 64 deletions(-)
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 811696e4d..b230dfdb7 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -286,7 +286,7 @@ impl ArrayData {
/// # Safety
///
/// The input values *must* form a valid Arrow array for
- /// `data_type`, or undefined behavior can results.
+ /// `data_type`, or undefined behavior can result.
///
/// Note: This is a low level API and most users of the arrow
/// crate should create arrays using the methods in the `array`
@@ -318,19 +318,20 @@ impl ArrayData {
// Provide a force_validate mode
#[cfg(feature = "force_validate")]
- new_self.validate_full().unwrap();
+ new_self.validate_data().unwrap();
new_self
}
- /// Create a new ArrayData, validating that the provided buffers
- /// form a valid Arrow array of the specified data type.
+ /// Create a new ArrayData, validating that the provided buffers form a
valid
+ /// Arrow array of the specified data type.
///
/// If the number of nulls in `null_bit_buffer` is 0 then the
null_bit_buffer
/// is set to `None`.
///
- /// Note: This is a low level API and most users of the arrow
- /// crate should create arrays using the methods in the `array`
- /// module.
+ /// Internally this calls through to [`Self::validate_data`]
+ ///
+ /// Note: This is a low level API and most users of the arrow crate should
create
+ /// arrays using the builders found in
[arrow_array](https://docs.rs/arrow-array)
pub fn try_new(
data_type: DataType,
len: usize,
@@ -366,7 +367,10 @@ impl ArrayData {
};
// As the data is not trusted, do a full validation of its contents
- new_self.validate_full()?;
+ // We don't need to validate children as we can assume that the
+ // [`ArrayData`] in `child_data` have already been validated through
+ // a call to `ArrayData::try_new` or created using unsafe
+ new_self.validate_data()?;
Ok(new_self)
}
@@ -617,8 +621,8 @@ impl ArrayData {
/// contents of the buffers (e.g. that all offsets for UTF8 arrays
/// are within the bounds of the values buffer).
///
- /// See [ArrayData::validate_full] to validate fully the offset content
- /// and the validitiy of utf8 data
+ /// See [ArrayData::validate_data] to validate fully the offset content
+ /// and the validity of utf8 data
pub fn validate(&self) -> Result<(), ArrowError> {
// Need at least this mich space in each buffer
let len_plus_offset = self.len + self.offset;
@@ -954,35 +958,34 @@ impl ArrayData {
Ok(values_data)
}
- /// "expensive" validation that ensures:
+ /// Validate that the data contained within this [`ArrayData`] is valid
///
/// 1. Null count is correct
/// 2. All offsets are valid
/// 3. All String data is valid UTF-8
/// 4. All dictionary offsets are valid
///
- /// Does not (yet) check
- /// 1. Union type_ids are valid see
[#85](https://github.com/apache/arrow-rs/issues/85)
- /// Note calls `validate()` internally
- pub fn validate_full(&self) -> Result<(), ArrowError> {
- // Check all buffer sizes prior to looking at them more deeply in this
function
+ /// Internally this calls:
+ ///
+ /// * [`Self::validate`]
+ /// * [`Self::validate_nulls`]
+ /// * [`Self::validate_values`]
+ ///
+ /// Note: this does not recurse into children, for a recursive variant
+ /// see [`Self::validate_full`]
+ pub fn validate_data(&self) -> Result<(), ArrowError> {
self.validate()?;
-
- let null_bitmap_buffer = self
- .null_bitmap
- .as_ref()
- .map(|null_bitmap| null_bitmap.buffer_ref());
-
- let actual_null_count = count_nulls(null_bitmap_buffer, self.offset,
self.len);
- if actual_null_count != self.null_count {
- return Err(ArrowError::InvalidArgumentError(format!(
- "null_count value ({}) doesn't match actual number of nulls in
array ({})",
- self.null_count, actual_null_count
- )));
- }
-
+ self.validate_nulls()?;
self.validate_values()?;
+ Ok(())
+ }
+ /// Performs a full recursive validation of this [`ArrayData`] and all its
children
+ ///
+ /// This is equivalent to calling [`Self::validate_data`] on this
[`ArrayData`]
+ /// and all its children recursively
+ pub fn validate_full(&self) -> Result<(), ArrowError> {
+ self.validate_data()?;
// validate all children recursively
self.child_data
.iter()
@@ -995,10 +998,28 @@ impl ArrayData {
))
})
})?;
+ Ok(())
+ }
+ /// Validates the the null count is correct
+ pub fn validate_nulls(&self) -> Result<(), ArrowError> {
+ let nulls = self.null_buffer();
+
+ let actual_null_count = count_nulls(nulls, self.offset, self.len);
+ if actual_null_count != self.null_count {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "null_count value ({}) doesn't match actual number of nulls in
array ({})",
+ self.null_count, actual_null_count
+ )));
+ }
Ok(())
}
+ /// Validates the values stored within this [`ArrayData`] are valid
+ /// without recursing into child [`ArrayData`]
+ ///
+ /// Does not (yet) check
+ /// 1. Union type_ids are valid see
[#85](https://github.com/apache/arrow-rs/issues/85)
pub fn validate_values(&self) -> Result<(), ArrowError> {
match &self.data_type {
DataType::Utf8 => self.validate_utf8::<i32>(),
diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs
index 4faf69658..64c433a66 100644
--- a/arrow/tests/array_validation.rs
+++ b/arrow/tests/array_validation.rs
@@ -753,40 +753,6 @@ fn test_validate_list_negative_offsets() {
.unwrap();
}
-#[test]
-#[should_panic(expected = "Value at position 1 out of bounds: -1 (should be in
[0, 1])")]
-/// test that children are validated recursively (aka bugs in child data of
struct also are flagged)
-fn test_validate_recursive() {
- // Form invalid dictionary array
- let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
- // -1 is not a valid index
- let keys: Int32Array = [Some(1), Some(-1), Some(1)].into_iter().collect();
-
- let dict_data_type = DataType::Dictionary(
- Box::new(keys.data_type().clone()),
- Box::new(values.data_type().clone()),
- );
-
- // purposely create an invalid child data
- let dict_data = unsafe {
- ArrayData::new_unchecked(
- dict_data_type,
- 2,
- None,
- None,
- 0,
- vec![keys.data().buffers()[0].clone()],
- vec![values.into_data()],
- )
- };
-
- // Now, try and create a struct with this invalid child data (and expect
an error)
- let data_type =
- DataType::Struct(vec![Field::new("d", dict_data.data_type().clone(),
true)]);
-
- ArrayData::try_new(data_type, 1, None, 0, vec![],
vec![dict_data]).unwrap();
-}
-
/// returns a buffer initialized with some constant value for tests
fn make_i32_buffer(n: usize) -> Buffer {
Buffer::from_slice_ref(&vec![42i32; n])