alamb commented on a change in pull request #810:
URL: https://github.com/apache/arrow-rs/pull/810#discussion_r743123785



##########
File path: arrow/src/array/data.rs
##########
@@ -559,6 +570,452 @@ impl ArrayData {
             )
         }
     }
+
+    /// "cheap" validation of an `ArrayData`. Ensures buffers are
+    /// sufficiently sized to store `len` + `offset` total elements of
+    /// `data_type` and performs other inexpensive consistency checks.
+    ///
+    /// This check is "cheap" in the sense that it does not validate the
+    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
+    /// are within the bounds of the values buffer).
+    ///
+    /// TODO: add a validate_full that validates the offsets and valid utf8 
data
+    pub fn validate(&self) -> Result<()> {
+        // Need at least this much space in each buffer
+        let len_plus_offset = self.len + self.offset;
+
+        // Check that the data layout conforms to the spec
+        let layout = layout(&self.data_type);
+
+        // Will validate Union when conforms to new spec:
+        // https://github.com/apache/arrow-rs/issues/85
+        if matches!(&self.data_type, DataType::Union(_)) {
+            return Ok(());
+        }
+        if self.buffers.len() != layout.buffers.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Expected {} buffers in array of type {:?}, got {}",
+                layout.buffers.len(),
+                self.data_type,
+                self.buffers.len(),
+            )));
+        }
+
+        for (i, (buffer, spec)) in
+            self.buffers.iter().zip(layout.buffers.iter()).enumerate()
+        {
+            match spec {
+                BufferSpec::FixedWidth { byte_width } => {
+                    let min_buffer_size = len_plus_offset
+                        .checked_mul(*byte_width)
+                        .expect("integer overflow computing min buffer size");
+
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes in buffers[{}] in array of 
type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::VariableWidth => {
+                    // not cheap to validate (need to look at the
+                    // data). Partially checked in validate_offsets
+                    // called below. Can check with `validate_full`
+                }
+                BufferSpec::BitMap => {
+                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes for bitmap in buffers[{}] 
in array of type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::AlwaysNull => {
+                    // Nothing to validate
+                }
+            }
+        }
+
+        if self.null_count > self.len {

Review comment:
       > I'm wondering what could happen if the null_count actually gives a 
different number than the validity buffer and whether this could lead to 
undefined behavior in some later operation. Most callers pass None anyway, so 
we calculate a number which is guaranteed to be in range.
   
   I suspect (but can not prove) that if `null_count` is inconsistent with the 
validity buffer then there may be incorrect answers but not undefined behavior 
(in the Rust sense).
   
   My plan will be: 
   1. In a separate PR (as it will not be backwards compatible) remove the 
`null_count` from `try_new()`: https://github.com/apache/arrow-rs/issues/911
   2. In the PR where I add the full-on data checking (`validate_full()`) 
ensure that the declared null count matches the validity bitmap
   

##########
File path: arrow/src/datatypes/datatype.rs
##########
@@ -477,6 +477,15 @@ impl DataType {
         )
     }
 
+    /// Returns true if this type is integral: (UInt*, Int*).
+    pub fn is_integer(t: &DataType) -> bool {

Review comment:
       Done in 8acd8d22a7

##########
File path: arrow/src/array/data.rs
##########
@@ -559,6 +570,452 @@ impl ArrayData {
             )
         }
     }
+
+    /// "cheap" validation of an `ArrayData`. Ensures buffers are
+    /// sufficiently sized to store `len` + `offset` total elements of
+    /// `data_type` and performs other inexpensive consistency checks.
+    ///
+    /// This check is "cheap" in the sense that it does not validate the
+    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
+    /// are within the bounds of the values buffer).
+    ///
+    /// TODO: add a validate_full that validates the offsets and valid utf8 
data
+    pub fn validate(&self) -> Result<()> {
+        // Need at least this much space in each buffer
+        let len_plus_offset = self.len + self.offset;
+
+        // Check that the data layout conforms to the spec
+        let layout = layout(&self.data_type);
+
+        // Will validate Union when conforms to new spec:
+        // https://github.com/apache/arrow-rs/issues/85
+        if matches!(&self.data_type, DataType::Union(_)) {
+            return Ok(());
+        }
+        if self.buffers.len() != layout.buffers.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Expected {} buffers in array of type {:?}, got {}",
+                layout.buffers.len(),
+                self.data_type,
+                self.buffers.len(),
+            )));
+        }
+
+        for (i, (buffer, spec)) in
+            self.buffers.iter().zip(layout.buffers.iter()).enumerate()
+        {
+            match spec {
+                BufferSpec::FixedWidth { byte_width } => {
+                    let min_buffer_size = len_plus_offset
+                        .checked_mul(*byte_width)
+                        .expect("integer overflow computing min buffer size");
+
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes in buffers[{}] in array of 
type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::VariableWidth => {
+                    // not cheap to validate (need to look at the
+                    // data). Partially checked in validate_offsets
+                    // called below. Can check with `validate_full`
+                }
+                BufferSpec::BitMap => {
+                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes for bitmap in buffers[{}] 
in array of type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::AlwaysNull => {
+                    // Nothing to validate
+                }
+            }
+        }
+
+        if self.null_count > self.len {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "null_count {} for an array exceeds length of {} elements",
+                self.null_count, self.len
+            )));
+        }
+
+        // check null bit buffer size
+        if let Some(null_bit_buffer) = self.null_bitmap.as_ref() {
+            let needed_len = bit_util::ceil(len_plus_offset, 8);
+            if null_bit_buffer.len() < needed_len {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "null_bit_buffer size too small. got {} needed {}",
+                    null_bit_buffer.len(),
+                    needed_len
+                )));
+            }
+        } else if self.null_count > 0 {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Array of type {} has {} nulls but no null bitmap",
+                self.data_type, self.null_count
+            )));
+        }
+
+        self.validate_child_data()?;
+
+        // Additional Type specific checks
+        match &self.data_type {
+            DataType::Utf8 | DataType::Binary => {
+                self.validate_offsets::<i32>(&self.buffers[0], 
self.buffers[1].len())?;
+            }
+            DataType::LargeUtf8 | DataType::LargeBinary => {
+                self.validate_offsets::<i64>(&self.buffers[0], 
self.buffers[1].len())?;
+            }
+            DataType::Dictionary(key_type, _value_type) => {
+                // At the moment, constructing a DictionaryArray will also 
check this
+                if !DataType::is_integer(key_type) {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Dictionary values must be integer, but was {}",
+                        key_type
+                    )));
+                }
+            }
+            _ => {}
+        };
+
+        Ok(())
+    }
+
+    /// Does a cheap sanity check that the `self.len` values in `buffer` are 
valid
+    /// offsets (of type T) into some other buffer of `values_length` bytes 
long
+    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
+        &self,
+        buffer: &Buffer,
+        values_length: usize,
+    ) -> Result<()> {
+        // Validate that there are the correct number of offsets for this 
array's length
+        let required_offsets = self.len + self.offset + 1;
+
+        // An empty list-like array can have 0 offsets
+        if buffer.is_empty() {
+            return Ok(());
+        }
+
+        if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Offsets buffer size (bytes): {} isn't large enough for {}. 
Length {} needs {}",
+                buffer.len(), self.data_type, self.len, required_offsets
+            )));
+        }
+
+        // Justification: buffer size was validated above
+        let offsets = unsafe { buffer.typed_data::<T>() };
+
+        let first_offset = offsets[0].to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Error converting offset[0] ({}) to usize for {}",
+                offsets[0], self.data_type
+            ))
+        })?;
+
+        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Error converting offset[{}] ({}) to usize for {}",
+                self.len, offsets[self.len], self.data_type
+            ))
+        })?;
+
+        let data_extent = last_offset.checked_sub(first_offset).ok_or_else(|| {

Review comment:
       🤔  I was able to write test cases that hit both of these conditions, as 
below. The one I couldn't actually figure out how to write a test for was 
`first_offset > last_offset`, but perhaps that is what you are trying to say. 
   
   ```
       #[test]
       #[should_panic(expected = "Underflow computing offset extent 4 - 3 in 
Utf8")]
       fn test_validate_offsets_range_too_small() {
           let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
           // start offset is larger than end
           let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]);
           ArrayData::try_new(
               DataType::Utf8,
               2,
               None,
               None,
               0,
               vec![offsets_buffer, data_buffer],
               vec![],
           )
           .unwrap();
       }
   
       #[test]
       #[should_panic(
           expected = "Length spanned by offsets in Utf8 (10) is larger than 
the values array size (6)"
       )]
       fn test_validate_offsets_range_too_large() {
           let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
           // 10 is off the end of the buffer
           let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]);
           ArrayData::try_new(
               DataType::Utf8,
               2,
               None,
               None,
               0,
               vec![offsets_buffer, data_buffer],
               vec![],
           )
           .unwrap();
       }
   ```

##########
File path: arrow/src/datatypes/datatype.rs
##########
@@ -477,6 +477,15 @@ impl DataType {
         )
     }
 
+    /// Returns true if this type is integral: (UInt*, Int*).
+    pub fn is_integer(t: &DataType) -> bool {

Review comment:
       Done in 8acd8d22a7 

##########
File path: arrow/src/array/data.rs
##########
@@ -559,6 +570,452 @@ impl ArrayData {
             )
         }
     }
+
+    /// "cheap" validation of an `ArrayData`. Ensures buffers are
+    /// sufficiently sized to store `len` + `offset` total elements of
+    /// `data_type` and performs other inexpensive consistency checks.
+    ///
+    /// This check is "cheap" in the sense that it does not validate the
+    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
+    /// are within the bounds of the values buffer).
+    ///
+    /// TODO: add a validate_full that validates the offsets and valid utf8 
data
+    pub fn validate(&self) -> Result<()> {
+        // Need at least this much space in each buffer
+        let len_plus_offset = self.len + self.offset;
+
+        // Check that the data layout conforms to the spec
+        let layout = layout(&self.data_type);
+
+        // Will validate Union when conforms to new spec:
+        // https://github.com/apache/arrow-rs/issues/85
+        if matches!(&self.data_type, DataType::Union(_)) {
+            return Ok(());
+        }
+        if self.buffers.len() != layout.buffers.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Expected {} buffers in array of type {:?}, got {}",
+                layout.buffers.len(),
+                self.data_type,
+                self.buffers.len(),
+            )));
+        }
+
+        for (i, (buffer, spec)) in
+            self.buffers.iter().zip(layout.buffers.iter()).enumerate()
+        {
+            match spec {
+                BufferSpec::FixedWidth { byte_width } => {
+                    let min_buffer_size = len_plus_offset
+                        .checked_mul(*byte_width)
+                        .expect("integer overflow computing min buffer size");
+
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes in buffers[{}] in array of 
type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::VariableWidth => {
+                    // not cheap to validate (need to look at the
+                    // data). Partially checked in validate_offsets
+                    // called below. Can check with `validate_full`
+                }
+                BufferSpec::BitMap => {
+                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes for bitmap in buffers[{}] 
in array of type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::AlwaysNull => {
+                    // Nothing to validate
+                }
+            }
+        }
+
+        if self.null_count > self.len {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "null_count {} for an array exceeds length of {} elements",
+                self.null_count, self.len
+            )));
+        }
+
+        // check null bit buffer size
+        if let Some(null_bit_buffer) = self.null_bitmap.as_ref() {
+            let needed_len = bit_util::ceil(len_plus_offset, 8);
+            if null_bit_buffer.len() < needed_len {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "null_bit_buffer size too small. got {} needed {}",
+                    null_bit_buffer.len(),
+                    needed_len
+                )));
+            }
+        } else if self.null_count > 0 {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Array of type {} has {} nulls but no null bitmap",
+                self.data_type, self.null_count
+            )));
+        }
+
+        self.validate_child_data()?;
+
+        // Additional Type specific checks
+        match &self.data_type {
+            DataType::Utf8 | DataType::Binary => {
+                self.validate_offsets::<i32>(&self.buffers[0], 
self.buffers[1].len())?;
+            }
+            DataType::LargeUtf8 | DataType::LargeBinary => {
+                self.validate_offsets::<i64>(&self.buffers[0], 
self.buffers[1].len())?;
+            }
+            DataType::Dictionary(key_type, _value_type) => {
+                // At the moment, constructing a DictionaryArray will also 
check this
+                if !DataType::is_dictionary_key_type(key_type) {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Dictionary values must be integer, but was {}",
+                        key_type
+                    )));
+                }
+            }
+            _ => {}
+        };
+
+        Ok(())
+    }
+
+    /// Does a cheap sanity check that the `self.len` values in `buffer` are 
valid
+    /// offsets (of type T) into some other buffer of `values_length` bytes 
long
+    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
+        &self,
+        buffer: &Buffer,
+        values_length: usize,
+    ) -> Result<()> {
+        // Validate that there are the correct number of offsets for this 
array's length
+        let required_offsets = self.len + self.offset + 1;
+
+        // An empty list-like array can have 0 offsets
+        if buffer.is_empty() {
+            return Ok(());
+        }
+
+        if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Offsets buffer size (bytes): {} isn't large enough for {}. 
Length {} needs {}",
+                buffer.len(), self.data_type, self.len, required_offsets
+            )));
+        }
+
+        // Justification: buffer size was validated above
+        let offsets = unsafe { buffer.typed_data::<T>() };

Review comment:
       This is an excellent spot -- fixed (with test) in 88a41b334f

##########
File path: arrow/src/array/data.rs
##########
@@ -559,6 +570,452 @@ impl ArrayData {
             )
         }
     }
+
+    /// "cheap" validation of an `ArrayData`. Ensures buffers are
+    /// sufficiently sized to store `len` + `offset` total elements of
+    /// `data_type` and performs other inexpensive consistency checks.
+    ///
+    /// This check is "cheap" in the sense that it does not validate the
+    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
+    /// are within the bounds of the values buffer).
+    ///
+    /// TODO: add a validate_full that validates the offsets and valid utf8 
data
+    pub fn validate(&self) -> Result<()> {
+        // Need at least this much space in each buffer
+        let len_plus_offset = self.len + self.offset;
+
+        // Check that the data layout conforms to the spec
+        let layout = layout(&self.data_type);
+
+        // Will validate Union when conforms to new spec:
+        // https://github.com/apache/arrow-rs/issues/85
+        if matches!(&self.data_type, DataType::Union(_)) {
+            return Ok(());
+        }
+        if self.buffers.len() != layout.buffers.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Expected {} buffers in array of type {:?}, got {}",
+                layout.buffers.len(),
+                self.data_type,
+                self.buffers.len(),
+            )));
+        }
+
+        for (i, (buffer, spec)) in
+            self.buffers.iter().zip(layout.buffers.iter()).enumerate()
+        {
+            match spec {
+                BufferSpec::FixedWidth { byte_width } => {
+                    let min_buffer_size = len_plus_offset
+                        .checked_mul(*byte_width)
+                        .expect("integer overflow computing min buffer size");
+
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes in buffers[{}] in array of 
type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::VariableWidth => {
+                    // not cheap to validate (need to look at the
+                    // data). Partially checked in validate_offsets
+                    // called below. Can check with `validate_full`
+                }
+                BufferSpec::BitMap => {
+                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes for bitmap in buffers[{}] 
in array of type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::AlwaysNull => {
+                    // Nothing to validate
+                }
+            }
+        }
+
+        if self.null_count > self.len {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "null_count {} for an array exceeds length of {} elements",
+                self.null_count, self.len
+            )));
+        }
+
+        // check null bit buffer size
+        if let Some(null_bit_buffer) = self.null_bitmap.as_ref() {
+            let needed_len = bit_util::ceil(len_plus_offset, 8);
+            if null_bit_buffer.len() < needed_len {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "null_bit_buffer size too small. got {} needed {}",
+                    null_bit_buffer.len(),
+                    needed_len
+                )));
+            }
+        } else if self.null_count > 0 {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Array of type {} has {} nulls but no null bitmap",
+                self.data_type, self.null_count
+            )));
+        }
+
+        self.validate_child_data()?;
+
+        // Additional Type specific checks
+        match &self.data_type {
+            DataType::Utf8 | DataType::Binary => {
+                self.validate_offsets::<i32>(&self.buffers[0], 
self.buffers[1].len())?;
+            }
+            DataType::LargeUtf8 | DataType::LargeBinary => {
+                self.validate_offsets::<i64>(&self.buffers[0], 
self.buffers[1].len())?;
+            }
+            DataType::Dictionary(key_type, _value_type) => {
+                // At the moment, constructing a DictionaryArray will also 
check this
+                if !DataType::is_integer(key_type) {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Dictionary values must be integer, but was {}",
+                        key_type
+                    )));
+                }
+            }
+            _ => {}
+        };
+
+        Ok(())
+    }
+
+    /// Does a cheap sanity check that the `self.len` values in `buffer` are 
valid
+    /// offsets (of type T) into some other buffer of `values_length` bytes 
long
+    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
+        &self,
+        buffer: &Buffer,
+        values_length: usize,
+    ) -> Result<()> {
+        // Validate that there are the correct number of offsets for this 
array's length
+        let required_offsets = self.len + self.offset + 1;
+
+        // An empty list-like array can have 0 offsets
+        if buffer.is_empty() {
+            return Ok(());
+        }
+
+        if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Offsets buffer size (bytes): {} isn't large enough for {}. 
Length {} needs {}",
+                buffer.len(), self.data_type, self.len, required_offsets
+            )));
+        }
+
+        // Justification: buffer size was validated above
+        let offsets = unsafe { buffer.typed_data::<T>() };
+
+        let first_offset = offsets[0].to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Error converting offset[0] ({}) to usize for {}",
+                offsets[0], self.data_type
+            ))
+        })?;
+
+        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Error converting offset[{}] ({}) to usize for {}",
+                self.len, offsets[self.len], self.data_type
+            ))
+        })?;
+
+        let data_extent = last_offset.checked_sub(first_offset).ok_or_else(|| {

Review comment:
       That is a great idea. I did so in 1446674a45 (all tests that I expected 
to fail still do, but with slightly different messages)
   




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to