alamb commented on a change in pull request #810:
URL: https://github.com/apache/arrow-rs/pull/810#discussion_r743888897
##########
File path: arrow/src/array/data.rs
##########
@@ -559,6 +570,452 @@ impl ArrayData {
)
}
}
+
+ /// "cheap" validation of an `ArrayData`. Ensures buffers are
+ /// sufficiently sized to store `len` + `offset` total elements of
+ /// `data_type` and performs other inexpensive consistency checks.
+ ///
+ /// This check is "cheap" in the sense that it does not validate the
+ /// contents of the buffers (e.g. that all offsets for UTF8 arrays
+ /// are within the bounds of the values buffer).
+ ///
+ /// TODO: add a validate_full that validates the offsets and valid utf8 data
+ pub fn validate(&self) -> Result<()> {
+ // Need at least this much space in each buffer
+ let len_plus_offset = self.len + self.offset;
+
+ // Check that the data layout conforms to the spec
+ let layout = layout(&self.data_type);
+
+ // Will validate Union when conforms to new spec:
+ // https://github.com/apache/arrow-rs/issues/85
+ if matches!(&self.data_type, DataType::Union(_)) {
+ return Ok(());
+ }
+ if self.buffers.len() != layout.buffers.len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Expected {} buffers in array of type {:?}, got {}",
+ layout.buffers.len(),
+ self.data_type,
+ self.buffers.len(),
+ )));
+ }
+
+ for (i, (buffer, spec)) in
+ self.buffers.iter().zip(layout.buffers.iter()).enumerate()
+ {
+ match spec {
+ BufferSpec::FixedWidth { byte_width } => {
+ let min_buffer_size = len_plus_offset
+ .checked_mul(*byte_width)
+ .expect("integer overflow computing min buffer size");
+
+ if buffer.len() < min_buffer_size {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Need at least {} bytes in buffers[{}] in array of
type {:?}, but got {}",
+ min_buffer_size, i, self.data_type, buffer.len()
+ )));
+ }
+ }
+ BufferSpec::VariableWidth => {
+ // not cheap to validate (need to look at the
+ // data). Partially checked in validate_offsets
+ // called below. Can check with `validate_full`
+ }
+ BufferSpec::BitMap => {
+ let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
+ if buffer.len() < min_buffer_size {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Need at least {} bytes for bitmap in buffers[{}]
in array of type {:?}, but got {}",
+ min_buffer_size, i, self.data_type, buffer.len()
+ )));
+ }
+ }
+ BufferSpec::AlwaysNull => {
+ // Nothing to validate
+ }
+ }
+ }
+
+ if self.null_count > self.len {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "null_count {} for an array exceeds length of {} elements",
+ self.null_count, self.len
+ )));
+ }
+
+ // check null bit buffer size
+ if let Some(null_bit_buffer) = self.null_bitmap.as_ref() {
+ let needed_len = bit_util::ceil(len_plus_offset, 8);
+ if null_bit_buffer.len() < needed_len {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "null_bit_buffer size too small. got {} needed {}",
+ null_bit_buffer.len(),
+ needed_len
+ )));
+ }
+ } else if self.null_count > 0 {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Array of type {} has {} nulls but no null bitmap",
+ self.data_type, self.null_count
+ )));
+ }
+
+ self.validate_child_data()?;
+
+ // Additional Type specific checks
+ match &self.data_type {
+ DataType::Utf8 | DataType::Binary => {
+ self.validate_offsets::<i32>(&self.buffers[0], self.buffers[1].len())?;
+ }
+ DataType::LargeUtf8 | DataType::LargeBinary => {
+ self.validate_offsets::<i64>(&self.buffers[0], self.buffers[1].len())?;
+ }
+ DataType::Dictionary(key_type, _value_type) => {
+ // At the moment, constructing a DictionaryArray will also check this
+ if !DataType::is_integer(key_type) {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Dictionary values must be integer, but was {}",
+ key_type
+ )));
+ }
+ }
+ _ => {}
+ };
+
+ Ok(())
+ }
+
+ /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
+ /// offsets (of type T) into some other buffer of `values_length` bytes long
+ fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
+ &self,
+ buffer: &Buffer,
+ values_length: usize,
+ ) -> Result<()> {
+ // Validate that there are the correct number of offsets for this array's length
+ let required_offsets = self.len + self.offset + 1;
+
+ // An empty list-like array can have 0 offsets
+ if buffer.is_empty() {
+ return Ok(());
+ }
+
+ if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Offsets buffer size (bytes): {} isn't large enough for {}.
Length {} needs {}",
+ buffer.len(), self.data_type, self.len, required_offsets
+ )));
+ }
+
+ // Justification: buffer size was validated above
+ let offsets = unsafe { buffer.typed_data::<T>() };
+
+ let first_offset = offsets[0].to_usize().ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "Error converting offset[0] ({}) to usize for {}",
+ offsets[0], self.data_type
+ ))
+ })?;
+
+ let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "Error converting offset[{}] ({}) to usize for {}",
+ self.len, offsets[self.len], self.data_type
+ ))
+ })?;
+
+ let data_extent = last_offset.checked_sub(first_offset).ok_or_else(|| {
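
For concreteness, here is a minimal, hypothetical sketch (not part of this PR) of a `Utf8` `ArrayData` that satisfies the offsets invariant checked above: a length-2 array needs at least `len + offset + 1 = 3` `i32` offsets in `buffers[0]`, and the offsets here also stay within the 3-byte values buffer in `buffers[1]`. It assumes the `ArrayData::builder` API as it stood around this change, where `ArrayDataBuilder::build` still returned `ArrayData` directly rather than a `Result`.

```rust
use arrow::array::ArrayData;
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;

// Two strings, "ab" and "c": offsets [0, 2, 3], values b"abc".
fn example_utf8_data() -> ArrayData {
    let offsets = Buffer::from_slice_ref(&[0i32, 2, 3]);
    let values = Buffer::from_slice_ref(&[b'a', b'b', b'c']);
    ArrayData::builder(DataType::Utf8)
        .len(2) // 2 elements => at least 2 + 0 + 1 = 3 offsets required
        .add_buffer(offsets) // buffers[0]: offsets
        .add_buffer(values) // buffers[1]: values
        .build()
}
```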
##########
Review comment:
That is a great idea. I did so in 1446674a45 (all tests that I expected
to fail still do, but with slightly different messages)
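
To illustrate the kind of error the cheap check is meant to surface, here is a second hypothetical sketch (again not from the PR, under the same assumed builder API): an `Int32` array of declared length 5 needs at least `5 * 4 = 20` bytes in its values buffer, so a 12-byte buffer should be rejected.

```rust
use arrow::array::ArrayData;
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;
use arrow::error::Result;

fn reject_short_values_buffer() -> Result<()> {
    // Only 3 values (12 bytes) backing a declared length of 5.
    let values = Buffer::from_slice_ref(&[1i32, 2, 3]);
    let data = ArrayData::builder(DataType::Int32)
        .len(5)
        .add_buffer(values)
        .build();
    // Expected to fail with something like:
    // "Need at least 20 bytes in buffers[0] in array of type Int32, but got 12"
    data.validate()
}
```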