This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new bbd85ed3d29 Add `ListView` & `LargeListView` basic construction and 
validation (#5664)
bbd85ed3d29 is described below

commit bbd85ed3d29601a293f5297736101e6e4460d548
Author: Kikkon <[email protected]>
AuthorDate: Tue Apr 30 23:48:33 2024 +0800

    Add `ListView` & `LargeListView` basic construction and validation (#5664)
    
    * feat: list view basic construction and validation
    
    * fix: validate offset and sizes
    
    * chore: remove unused check
    
    * chore: add overflow checked
    
    * chore: lint
---
 arrow-data/src/data.rs          | 70 ++++++++++++++++++++++++++++++--
 arrow/tests/array_validation.rs | 88 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+), 4 deletions(-)

diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 0ea74f789dd..d092fd049d7 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -22,9 +22,9 @@ use crate::bit_iterator::BitSliceIterator;
 use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
 use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
 use arrow_schema::{ArrowError, DataType, UnionMode};
-use std::mem;
 use std::ops::Range;
 use std::sync::Arc;
+use std::{mem, usize};
 
 use crate::{equal, validate_binary_view, validate_string_view};
 
@@ -929,6 +929,41 @@ impl ArrayData {
         Ok(())
     }
 
+    /// Does a cheap sanity check that the `self.len` values in `buffer` are 
valid
+    /// offsets and sizes (of type T) into some other buffer of 
`values_length` bytes long
+    fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + 
std::fmt::Display>(
+        &self,
+        values_length: usize,
+    ) -> Result<(), ArrowError> {
+        let offsets: &[T] = self.typed_buffer(0, self.len)?;
+        let sizes: &[T] = self.typed_buffer(1, self.len)?;
+        for i in 0..values_length {
+            let size = sizes[i].to_usize().ok_or_else(|| {
+                ArrowError::InvalidArgumentError(format!(
+                    "Error converting size[{}] ({}) to usize for {}",
+                    i, sizes[i], self.data_type
+                ))
+            })?;
+            let offset = offsets[i].to_usize().ok_or_else(|| {
+                ArrowError::InvalidArgumentError(format!(
+                    "Error converting offset[{}] ({}) to usize for {}",
+                    i, offsets[i], self.data_type
+                ))
+            })?;
+            if size
+                .checked_add(offset)
+                .expect("Offset and size have exceeded the usize boundary")
+                > values_length
+            {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Size {} at index {} is larger than the remaining values 
for {}",
+                    size, i, self.data_type
+                )));
+            }
+        }
+        Ok(())
+    }
+
     /// Validates the layout of `child_data` ArrayData structures
     fn validate_child_data(&self) -> Result<(), ArrowError> {
         match &self.data_type {
@@ -942,6 +977,16 @@ impl ArrayData {
                 self.validate_offsets::<i64>(values_data.len)?;
                 Ok(())
             }
+            DataType::ListView(field) => {
+                let values_data = 
self.get_single_valid_child_data(field.data_type())?;
+                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
+                Ok(())
+            }
+            DataType::LargeListView(field) => {
+                let values_data = 
self.get_single_valid_child_data(field.data_type())?;
+                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
+                Ok(())
+            }
             DataType::FixedSizeList(field, list_size) => {
                 let values_data = 
self.get_single_valid_child_data(field.data_type())?;
 
@@ -1546,9 +1591,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
         DataType::BinaryView | DataType::Utf8View => 
DataTypeLayout::new_view(),
         DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), 
// all in child data
         DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
-        DataType::ListView(_) | DataType::LargeListView(_) => {
-            unimplemented!("ListView/LargeListView not implemented")
-        }
+        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
+        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
         DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
         DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
         DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in 
child data,
@@ -1661,6 +1705,24 @@ impl DataTypeLayout {
             variadic: true,
         }
     }
+
+    /// Describes a list view type
+    pub fn new_list_view<T>() -> Self {
+        Self {
+            buffers: vec![
+                BufferSpec::FixedWidth {
+                    byte_width: mem::size_of::<T>(),
+                    alignment: mem::align_of::<T>(),
+                },
+                BufferSpec::FixedWidth {
+                    byte_width: mem::size_of::<T>(),
+                    alignment: mem::align_of::<T>(),
+                },
+            ],
+            can_contain_null_mask: true,
+            variadic: true,
+        }
+    }
 }
 
 /// Layout specification for a single data type buffer
diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs
index f5298f82e0a..41def9051d4 100644
--- a/arrow/tests/array_validation.rs
+++ b/arrow/tests/array_validation.rs
@@ -342,6 +342,94 @@ fn test_validate_offsets_last_too_large() {
     .unwrap();
 }
 
+/// Test that the list of type `data_type` generates correct offset and size 
out of bounds errors
+fn check_list_view_offsets_sizes<T: ArrowNativeType>(
+    data_type: DataType,
+    offsets: Vec<T>,
+    sizes: Vec<T>,
+) {
+    let values: Int32Array = [Some(1), Some(2), Some(3), 
Some(4)].into_iter().collect();
+    let offsets_buffer = Buffer::from_slice_ref(offsets);
+    let sizes_buffer = Buffer::from_slice_ref(sizes);
+    ArrayData::try_new(
+        data_type,
+        4,
+        None,
+        0,
+        vec![offsets_buffer, sizes_buffer],
+        vec![values.into_data()],
+    )
+    .unwrap();
+}
+
+#[test]
+#[should_panic(expected = "Size 3 at index 3 is larger than the remaining 
values for ListView")]
+fn test_validate_list_view_offsets_sizes() {
+    let field_type = Field::new("f", DataType::Int32, true);
+    check_list_view_offsets_sizes::<i32>(
+        DataType::ListView(Arc::new(field_type)),
+        vec![0, 1, 1, 2],
+        vec![1, 1, 1, 3],
+    );
+}
+
+#[test]
+#[should_panic(
+    expected = "Size 3 at index 3 is larger than the remaining values for 
LargeListView"
+)]
+fn test_validate_large_list_view_offsets_sizes() {
+    let field_type = Field::new("f", DataType::Int32, true);
+    check_list_view_offsets_sizes::<i64>(
+        DataType::LargeListView(Arc::new(field_type)),
+        vec![0, 1, 1, 2],
+        vec![1, 1, 1, 3],
+    );
+}
+
+#[test]
+#[should_panic(expected = "Error converting offset[1] (-1) to usize for 
ListView")]
+fn test_validate_list_view_negative_offsets() {
+    let field_type = Field::new("f", DataType::Int32, true);
+    check_list_view_offsets_sizes::<i32>(
+        DataType::ListView(Arc::new(field_type)),
+        vec![0, -1, 1, 2],
+        vec![1, 1, 1, 3],
+    );
+}
+
+#[test]
+#[should_panic(expected = "Error converting size[2] (-1) to usize for 
ListView")]
+fn test_validate_list_view_negative_sizes() {
+    let field_type = Field::new("f", DataType::Int32, true);
+    check_list_view_offsets_sizes::<i32>(
+        DataType::ListView(Arc::new(field_type)),
+        vec![0, 1, 1, 2],
+        vec![1, 1, -1, 3],
+    );
+}
+
+#[test]
+#[should_panic(expected = "Error converting offset[1] (-1) to usize for 
LargeListView")]
+fn test_validate_large_list_view_negative_offsets() {
+    let field_type = Field::new("f", DataType::Int32, true);
+    check_list_view_offsets_sizes::<i64>(
+        DataType::LargeListView(Arc::new(field_type)),
+        vec![0, -1, 1, 2],
+        vec![1, 1, 1, 3],
+    );
+}
+
+#[test]
+#[should_panic(expected = "Error converting size[2] (-1) to usize for 
LargeListView")]
+fn test_validate_large_list_view_negative_sizes() {
+    let field_type = Field::new("f", DataType::Int32, true);
+    check_list_view_offsets_sizes::<i64>(
+        DataType::LargeListView(Arc::new(field_type)),
+        vec![0, 1, 1, 2],
+        vec![1, 1, -1, 3],
+    );
+}
+
 #[test]
 #[should_panic(
     expected = "Values length 4 is less than the length (2) multiplied by the 
value size (2) for FixedSizeList"

Reply via email to