This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new bbd85ed3d29 Add `ListView` & `LargeListView` basic construction and
validation (#5664)
bbd85ed3d29 is described below
commit bbd85ed3d29601a293f5297736101e6e4460d548
Author: Kikkon <[email protected]>
AuthorDate: Tue Apr 30 23:48:33 2024 +0800
Add `ListView` & `LargeListView` basic construction and validation (#5664)
* feat: list view basic construction and validation
* fix: validate offset and sizes
* chore: remove unused check
* chore: add overflow checked
* chore: lint
---
arrow-data/src/data.rs | 70 ++++++++++++++++++++++++++++++--
arrow/tests/array_validation.rs | 88 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 154 insertions(+), 4 deletions(-)
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 0ea74f789dd..d092fd049d7 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -22,9 +22,9 @@ use crate::bit_iterator::BitSliceIterator;
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
use arrow_schema::{ArrowError, DataType, UnionMode};
-use std::mem;
use std::ops::Range;
use std::sync::Arc;
+use std::{mem, usize};
use crate::{equal, validate_binary_view, validate_string_view};
@@ -929,6 +929,41 @@ impl ArrayData {
Ok(())
}
+ /// Does a cheap sanity check that the `self.len` values in `buffer` are
valid
+ /// offsets and sizes (of type T) into some other buffer of
`values_length` bytes long
+ fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num +
std::fmt::Display>(
+ &self,
+ values_length: usize,
+ ) -> Result<(), ArrowError> {
+ let offsets: &[T] = self.typed_buffer(0, self.len)?;
+ let sizes: &[T] = self.typed_buffer(1, self.len)?;
+ for i in 0..values_length {
+ let size = sizes[i].to_usize().ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "Error converting size[{}] ({}) to usize for {}",
+ i, sizes[i], self.data_type
+ ))
+ })?;
+ let offset = offsets[i].to_usize().ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "Error converting offset[{}] ({}) to usize for {}",
+ i, offsets[i], self.data_type
+ ))
+ })?;
+ if size
+ .checked_add(offset)
+ .expect("Offset and size have exceeded the usize boundary")
+ > values_length
+ {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Size {} at index {} is larger than the remaining values
for {}",
+ size, i, self.data_type
+ )));
+ }
+ }
+ Ok(())
+ }
+
/// Validates the layout of `child_data` ArrayData structures
fn validate_child_data(&self) -> Result<(), ArrowError> {
match &self.data_type {
@@ -942,6 +977,16 @@ impl ArrayData {
self.validate_offsets::<i64>(values_data.len)?;
Ok(())
}
+ DataType::ListView(field) => {
+ let values_data =
self.get_single_valid_child_data(field.data_type())?;
+ self.validate_offsets_and_sizes::<i32>(values_data.len)?;
+ Ok(())
+ }
+ DataType::LargeListView(field) => {
+ let values_data =
self.get_single_valid_child_data(field.data_type())?;
+ self.validate_offsets_and_sizes::<i64>(values_data.len)?;
+ Ok(())
+ }
DataType::FixedSizeList(field, list_size) => {
let values_data =
self.get_single_valid_child_data(field.data_type())?;
@@ -1546,9 +1591,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
DataType::BinaryView | DataType::Utf8View =>
DataTypeLayout::new_view(),
DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(),
// all in child data
DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
- DataType::ListView(_) | DataType::LargeListView(_) => {
- unimplemented!("ListView/LargeListView not implemented")
- }
+ DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
+ DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in
child data,
@@ -1661,6 +1705,24 @@ impl DataTypeLayout {
variadic: true,
}
}
+
+ /// Describes a list view type
+ pub fn new_list_view<T>() -> Self {
+ Self {
+ buffers: vec![
+ BufferSpec::FixedWidth {
+ byte_width: mem::size_of::<T>(),
+ alignment: mem::align_of::<T>(),
+ },
+ BufferSpec::FixedWidth {
+ byte_width: mem::size_of::<T>(),
+ alignment: mem::align_of::<T>(),
+ },
+ ],
+ can_contain_null_mask: true,
+ variadic: true,
+ }
+ }
}
/// Layout specification for a single data type buffer
diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs
index f5298f82e0a..41def9051d4 100644
--- a/arrow/tests/array_validation.rs
+++ b/arrow/tests/array_validation.rs
@@ -342,6 +342,94 @@ fn test_validate_offsets_last_too_large() {
.unwrap();
}
+/// Test that the list of type `data_type` generates correct offset and size
out of bounds errors
+fn check_list_view_offsets_sizes<T: ArrowNativeType>(
+ data_type: DataType,
+ offsets: Vec<T>,
+ sizes: Vec<T>,
+) {
+ let values: Int32Array = [Some(1), Some(2), Some(3),
Some(4)].into_iter().collect();
+ let offsets_buffer = Buffer::from_slice_ref(offsets);
+ let sizes_buffer = Buffer::from_slice_ref(sizes);
+ ArrayData::try_new(
+ data_type,
+ 4,
+ None,
+ 0,
+ vec![offsets_buffer, sizes_buffer],
+ vec![values.into_data()],
+ )
+ .unwrap();
+}
+
+#[test]
+#[should_panic(expected = "Size 3 at index 3 is larger than the remaining
values for ListView")]
+fn test_validate_list_view_offsets_sizes() {
+ let field_type = Field::new("f", DataType::Int32, true);
+ check_list_view_offsets_sizes::<i32>(
+ DataType::ListView(Arc::new(field_type)),
+ vec![0, 1, 1, 2],
+ vec![1, 1, 1, 3],
+ );
+}
+
+#[test]
+#[should_panic(
+ expected = "Size 3 at index 3 is larger than the remaining values for
LargeListView"
+)]
+fn test_validate_large_list_view_offsets_sizes() {
+ let field_type = Field::new("f", DataType::Int32, true);
+ check_list_view_offsets_sizes::<i64>(
+ DataType::LargeListView(Arc::new(field_type)),
+ vec![0, 1, 1, 2],
+ vec![1, 1, 1, 3],
+ );
+}
+
+#[test]
+#[should_panic(expected = "Error converting offset[1] (-1) to usize for
ListView")]
+fn test_validate_list_view_negative_offsets() {
+ let field_type = Field::new("f", DataType::Int32, true);
+ check_list_view_offsets_sizes::<i32>(
+ DataType::ListView(Arc::new(field_type)),
+ vec![0, -1, 1, 2],
+ vec![1, 1, 1, 3],
+ );
+}
+
+#[test]
+#[should_panic(expected = "Error converting size[2] (-1) to usize for
ListView")]
+fn test_validate_list_view_negative_sizes() {
+ let field_type = Field::new("f", DataType::Int32, true);
+ check_list_view_offsets_sizes::<i32>(
+ DataType::ListView(Arc::new(field_type)),
+ vec![0, 1, 1, 2],
+ vec![1, 1, -1, 3],
+ );
+}
+
+#[test]
+#[should_panic(expected = "Error converting offset[1] (-1) to usize for
LargeListView")]
+fn test_validate_large_list_view_negative_offsets() {
+ let field_type = Field::new("f", DataType::Int32, true);
+ check_list_view_offsets_sizes::<i64>(
+ DataType::LargeListView(Arc::new(field_type)),
+ vec![0, -1, 1, 2],
+ vec![1, 1, 1, 3],
+ );
+}
+
+#[test]
+#[should_panic(expected = "Error converting size[2] (-1) to usize for
LargeListView")]
+fn test_validate_large_list_view_negative_sizes() {
+ let field_type = Field::new("f", DataType::Int32, true);
+ check_list_view_offsets_sizes::<i64>(
+ DataType::LargeListView(Arc::new(field_type)),
+ vec![0, 1, 1, 2],
+ vec![1, 1, -1, 3],
+ );
+}
+
#[test]
#[should_panic(
expected = "Values length 4 is less than the length (2) multiplied by the
value size (2) for FixedSizeList"