alamb commented on code in PR #8645:
URL: https://github.com/apache/arrow-rs/pull/8645#discussion_r2453426449


##########
arrow-data/src/equal/list_view.rs:
##########
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::ArrayData;
+use crate::data::count_nulls;
+use crate::equal::equal_values;
+use arrow_buffer::ArrowNativeType;
+use num_integer::Integer;
+
+pub(super) fn list_view_equal<T: ArrowNativeType + Integer>(
+    lhs: &ArrayData,
+    rhs: &ArrayData,
+    lhs_start: usize,
+    rhs_start: usize,
+    len: usize,
+) -> bool {
+    let lhs_offsets = lhs.buffer::<T>(0);
+    let lhs_sizes = lhs.buffer::<T>(1);
+
+    let rhs_offsets = rhs.buffer::<T>(0);
+    let rhs_sizes = rhs.buffer::<T>(1);
+
+    let lhs_data = &lhs.child_data()[0];
+    let rhs_data = &rhs.child_data()[0];
+
+    let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len);
+    let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len);
+
+    if lhs_null_count != rhs_null_count {
+        return false;
+    }
+
+    if lhs_null_count == 0 {
+        // non-null pathway: all sizes must be equal, and all values must be 
equal
+        let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
+        let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];
+
+        if lhs_range_sizes.len() != rhs_range_sizes.len() {
+            return false;
+        }
+
+        // Check values for equality
+        let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len];
+        let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len];
+
+        if lhs_range_offsets.len() != rhs_range_offsets.len() {
+            return false;
+        }
+
+        for ((&lhs_offset, &rhs_offset), &size) in lhs_range_offsets
+            .iter()
+            .zip(rhs_range_offsets)
+            .zip(lhs_sizes)
+        {
+            let lhs_offset = lhs_offset.to_usize().unwrap();
+            let rhs_offset = rhs_offset.to_usize().unwrap();
+            let size = size.to_usize().unwrap();
+
+            // Check if offsets are valid for the given range
+            if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) 
{
+                return false;
+            }
+        }
+    } else {
+        // Need to integrate validity check in the inner loop.
+        // non-null pathway: all sizes must be equal, and all values must be 
equal
+        let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
+        let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];
+
+        let lhs_nulls = lhs.nulls().unwrap().slice(lhs_start, len);
+        let rhs_nulls = rhs.nulls().unwrap().slice(rhs_start, len);
+
+        // Sizes can differ if values are null
+        if lhs_range_sizes.len() != rhs_range_sizes.len() {

Review Comment:
   same as above



##########
arrow-select/src/concat.rs:
##########
@@ -885,6 +948,49 @@ mod tests {
         assert_eq!(array_result.as_ref(), &array_expected as &dyn Array);
     }
 
+    #[test]
+    fn test_concat_list_view_arrays() {
+        let list1 = vec![
+            (Some(vec![Some(-1), None])),
+            None,
+            Some(vec![Some(10), Some(20)]),
+        ];
+        let mut list1_array = ListViewBuilder::new(Int64Builder::new());
+        for v in list1.iter() {
+            list1_array.append_option(v.clone());
+        }
+        let list1_array = list1_array.finish();
+
+        let list2 = vec![
+            None,
+            Some(vec![Some(100), None]),
+            Some(vec![Some(102), Some(103)]),
+        ];
+        let mut list2_array = ListViewBuilder::new(Int64Builder::new());
+        for v in list2.iter() {
+            list2_array.append_option(v.clone());
+        }
+        let list2_array = list2_array.finish();
+
+        let list3 = vec![Some(vec![Some(1000), Some(1001)])];
+        let mut list3_array = ListViewBuilder::new(Int64Builder::new());
+        for v in list3.iter() {
+            list3_array.append_option(v.clone());
+        }
+        let list3_array = list3_array.finish();
+
+        let array_result = concat(&[&list1_array, &list2_array, 
&list3_array]).unwrap();

Review Comment:
   Could you also test concatenating sliced lists (that have a non zero offset)
   
   Something like
   
   ```rust
   let array_result = concat(&[&list1_array.slice(1,2), &list2_array, 
&list3_array.slice(2,1) ]).unwrap();
   ```



##########
arrow-data/src/transform/utils.rs:
##########
@@ -58,6 +58,13 @@ pub(super) unsafe fn get_last_offset<T: 
ArrowNativeType>(offset_buffer: &Mutable
     *unsafe { offsets.get_unchecked(offsets.len() - 1) }
 }
 
+#[inline]
+pub(super) fn get_last_value_or_default<T: ArrowNativeType>(offset_buffer: 
&MutableBuffer) -> T {
+    let (prefix, offsets, suffix) = unsafe { 
offset_buffer.as_slice().align_to::<T>() };

Review Comment:
   Can you please add comments here explaining why this unsafe is ok (follow 
from the above?)



##########
arrow-select/src/filter.rs:
##########
@@ -1370,6 +1396,60 @@ mod tests {
         assert_eq!(&make_array(expected), &result);
     }
 
+    #[test]
+    fn test_filter_list_view_array() {
+        // [[1, 2], null, [], [3,4]]
+        let mut list_array = 
ListViewBuilder::with_capacity(Int32Builder::with_capacity(6), 4);
+        list_array.append_value([Some(1), Some(2)]);
+        list_array.append_null();
+        list_array.append_value([]);
+        list_array.append_value([Some(3), Some(4)]);
+
+        let list_array = list_array.finish();
+        let predicate = BooleanArray::from_iter([true, true, false, true]);
+
+        // Filter result: [[1, 2], null, [3, 4]]
+        let filtered = filter(&list_array, &predicate)
+            .unwrap()
+            .as_list_view::<i32>()
+            .clone();
+
+        let mut expected = 
ListViewBuilder::with_capacity(Int32Builder::with_capacity(5), 3);
+        expected.append_value([Some(1), Some(2)]);
+        expected.append_null();
+        expected.append_value([Some(3), Some(4)]);
+        let expected = expected.finish();
+
+        assert_eq!(&filtered, &expected);
+    }
+
+    #[test]

Review Comment:
   Likewise can you please add a test for filtering sliced arrays?



##########
arrow-select/src/filter.rs:
##########
@@ -1370,6 +1396,60 @@ mod tests {
         assert_eq!(&make_array(expected), &result);
     }
 
+    #[test]
+    fn test_filter_list_view_array() {
+        // [[1, 2], null, [], [3,4]]
+        let mut list_array = 
ListViewBuilder::with_capacity(Int32Builder::with_capacity(6), 4);
+        list_array.append_value([Some(1), Some(2)]);
+        list_array.append_null();
+        list_array.append_value([]);
+        list_array.append_value([Some(3), Some(4)]);
+
+        let list_array = list_array.finish();
+        let predicate = BooleanArray::from_iter([true, true, false, true]);
+
+        // Filter result: [[1, 2], null, [3, 4]]

Review Comment:
   Can you also please add a test that filters out a non null element



##########
arrow-select/src/take.rs:
##########
@@ -621,6 +627,58 @@ where
     Ok(GenericListArray::<OffsetType::Native>::from(list_data))
 }
 
+fn take_list_view<IndexType, OffsetType>(
+    values: &GenericListViewArray<OffsetType::Native>,
+    indices: &PrimitiveArray<IndexType>,
+) -> Result<GenericListViewArray<OffsetType::Native>, ArrowError>
+where
+    IndexType: ArrowPrimitiveType,
+    OffsetType: ArrowPrimitiveType,
+    OffsetType::Native: OffsetSizeTrait,
+{
+    // Take executes only over the views.
+    let mut taken_offsets: Vec<OffsetType::Native> = 
Vec::with_capacity(indices.len());
+    let mut taken_sizes: Vec<OffsetType::Native> = 
Vec::with_capacity(indices.len());
+
+    // Initialize null buffer
+    let num_bytes = bit_util::ceil(indices.len(), 8);
+    let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, 
true);
+    let null_slice = null_buf.as_slice_mut();
+
+    for i in 0..indices.len() {

Review Comment:
   I wonder if you can make this faster by avoiding bounds checks 🤔 
   
   ```suggestion
       for (i, value) in indices.iter().enumerate() {
   ```



##########
arrow-data/src/equal/mod.rs:
##########
@@ -104,10 +105,9 @@ fn equal_values(
             byte_view_equal(lhs, rhs, lhs_start, rhs_start, len)
         }
         DataType::List(_) => list_equal::<i32>(lhs, rhs, lhs_start, rhs_start, 
len),
-        DataType::ListView(_) | DataType::LargeListView(_) => {
-            unimplemented!("ListView/LargeListView not yet implemented")
-        }
         DataType::LargeList(_) => list_equal::<i64>(lhs, rhs, lhs_start, 
rhs_start, len),
+        DataType::ListView(_) => list_view_equal::<i32>(lhs, rhs, lhs_start, 
rhs_start, len),
+        DataType::LargeListView(_) => list_view_equal::<i64>(lhs, rhs, 
lhs_start, rhs_start, len),

Review Comment:
   I think we need tests for the eq kernel as well



##########
arrow-data/src/transform/list_view.rs:
##########
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::ArrayData;
+use crate::transform::_MutableArrayData;
+use crate::transform::utils::get_last_value_or_default;
+use arrow_buffer::ArrowNativeType;
+use num_integer::Integer;
+use num_traits::CheckedAdd;
+
+pub(super) fn build_extend<T: ArrowNativeType + Integer + CheckedAdd>(
+    array: &ArrayData,
+) -> crate::transform::Extend<'_> {
+    let offsets = array.buffer::<T>(0);
+    let sizes = array.buffer::<T>(1);
+    Box::new(
+        move |mutable: &mut _MutableArrayData, _index: usize, start: usize, 
len: usize| {
+            let offset_buffer = &mut mutable.buffer1;
+            let sizes_buffer = &mut mutable.buffer2;
+
+            for &offset in &offsets[start..start + len] {
+                offset_buffer.push(offset);
+            }
+
+            // sizes
+            for &size in &sizes[start..start + len] {
+                sizes_buffer.push(size);
+            }
+
+            // the beauty of views is that we don't need to copy child_data, 
we just splat
+            // the offsets and sizes.
+        },
+    )
+}
+
+pub(super) fn extend_nulls<T: ArrowNativeType>(mutable: &mut 
_MutableArrayData, len: usize) {
+    let offset_buffer = &mut mutable.buffer1;
+    let sizes_buffer = &mut mutable.buffer2;
+
+    let last_offset: T = get_last_value_or_default(offset_buffer);

Review Comment:
   could this be zero? Or does it have to repeat the same value as the previous 
entry?
   
   Lists need to have same value as the previous entry, but I am not sure about 
list views



##########
arrow-select/src/take.rs:
##########
@@ -1871,6 +1971,52 @@ mod tests {
         test_take_list_with_nulls!(i64, LargeList, LargeListArray);
     }
 
+    #[test]
+    fn test_test_take_list_view_reversed() {
+        // Take reversed indices
+        list_view_test_case! {
+            values: vec![
+                Some(vec![Some(1), None, Some(3)]),
+                None,
+                Some(vec![Some(7), Some(8), None]),
+            ],
+            indices: vec![Some(2), Some(1), Some(0)],
+            expected: vec![
+                Some(vec![Some(7), Some(8), None]),
+                None,
+                Some(vec![Some(1), None, Some(3)]),
+            ]
+        };
+    }
+
+    #[test]
+    fn test_take_list_view_null_indices() {
+        // Take with null indices
+        list_view_test_case! {
+            values: vec![
+                Some(vec![Some(1), None, Some(3)]),
+                None,
+                Some(vec![Some(7), Some(8), None]),
+            ],
+            indices: vec![None, Some(0), None],
+            expected: vec![None, Some(vec![Some(1), None, Some(3)]), None]
+        };
+    }
+
+    #[test]
+    fn test_take_list_view_null_values() {
+        // Take at null values
+        list_view_test_case! {
+            values: vec![
+                Some(vec![Some(1), None, Some(3)]),
+                None,
+                Some(vec![Some(7), Some(8), None]),
+            ],
+            indices: vec![Some(1), Some(1), Some(1), None, None],
+            expected: vec![None; 5]
+        };
+    }

Review Comment:
   Also here, please a test with sliced array / offset



##########
arrow-select/src/take.rs:
##########
@@ -621,6 +627,58 @@ where
     Ok(GenericListArray::<OffsetType::Native>::from(list_data))
 }
 
+fn take_list_view<IndexType, OffsetType>(
+    values: &GenericListViewArray<OffsetType::Native>,
+    indices: &PrimitiveArray<IndexType>,
+) -> Result<GenericListViewArray<OffsetType::Native>, ArrowError>
+where
+    IndexType: ArrowPrimitiveType,
+    OffsetType: ArrowPrimitiveType,
+    OffsetType::Native: OffsetSizeTrait,
+{
+    // Take executes only over the views.
+    let mut taken_offsets: Vec<OffsetType::Native> = 
Vec::with_capacity(indices.len());
+    let mut taken_sizes: Vec<OffsetType::Native> = 
Vec::with_capacity(indices.len());
+
+    // Initialize null buffer
+    let num_bytes = bit_util::ceil(indices.len(), 8);
+    let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, 
true);
+    let null_slice = null_buf.as_slice_mut();
+
+    for i in 0..indices.len() {
+        if indices.is_valid(i) {
+            let idx = indices
+                .value(i)
+                .to_usize()
+                .ok_or_else(|| ArrowError::ComputeError("Cast to usize 
failed".to_string()))?;
+            taken_offsets.push(values.value_offset(idx));
+            taken_sizes.push(values.value_size(idx));
+
+            if !values.is_valid(idx) {
+                bit_util::unset_bit(null_slice, i);
+            }
+        } else {

Review Comment:
   you can probably play games here with a special loop for indices with no 
nulls 



##########
arrow-select/src/concat.rs:
##########
@@ -206,6 +208,63 @@ fn concat_lists<OffsetSize: OffsetSizeTrait>(
     Ok(Arc::new(array))
 }
 
+fn concat_list_view<OffsetSize: OffsetSizeTrait>(
+    arrays: &[&dyn Array],
+    field: &FieldRef,
+) -> Result<ArrayRef, ArrowError> {
+    let mut output_len = 0;
+    let mut list_has_nulls = false;
+
+    let lists = arrays
+        .iter()
+        .map(|x| x.as_list_view::<OffsetSize>())
+        .inspect(|l| {
+            output_len += l.len();
+            list_has_nulls |= l.null_count() != 0;
+        })
+        .collect::<Vec<_>>();
+
+    let lists_nulls = list_has_nulls.then(|| {
+        let mut nulls = BooleanBufferBuilder::new(output_len);
+        for l in &lists {
+            match l.nulls() {
+                Some(n) => nulls.append_buffer(n.inner()),
+                None => nulls.append_n(l.len(), true),
+            }
+        }
+        NullBuffer::new(nulls.finish())
+    });
+
+    let values: Vec<&dyn Array> = lists.iter().map(|l| 
l.values().as_ref()).collect();

Review Comment:
   this allocation is unfortunate, but I see it is driven by `concat`'s API



##########
arrow-data/src/equal/list_view.rs:
##########
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::ArrayData;
+use crate::data::count_nulls;
+use crate::equal::equal_values;
+use arrow_buffer::ArrowNativeType;
+use num_integer::Integer;
+
+pub(super) fn list_view_equal<T: ArrowNativeType + Integer>(
+    lhs: &ArrayData,
+    rhs: &ArrayData,
+    lhs_start: usize,
+    rhs_start: usize,
+    len: usize,
+) -> bool {
+    let lhs_offsets = lhs.buffer::<T>(0);
+    let lhs_sizes = lhs.buffer::<T>(1);
+
+    let rhs_offsets = rhs.buffer::<T>(0);
+    let rhs_sizes = rhs.buffer::<T>(1);
+
+    let lhs_data = &lhs.child_data()[0];
+    let rhs_data = &rhs.child_data()[0];
+
+    let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len);
+    let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len);
+
+    if lhs_null_count != rhs_null_count {
+        return false;
+    }
+
+    if lhs_null_count == 0 {
+        // non-null pathway: all sizes must be equal, and all values must be 
equal
+        let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
+        let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];
+
+        if lhs_range_sizes.len() != rhs_range_sizes.len() {
+            return false;
+        }
+

Review Comment:
   the comment above suggests you also meant for this path to check that the 
sizes are the same, but there is no actual comparsison
   
   Perhaps changing 
   
   ```rust
   
           if lhs_range_sizes.len() != rhs_range_sizes.len() {
               return false;
           }
   ```
   
   to this
   
   ```rust
   
           if lhs_range_sizes != rhs_range_sizes {
               return false;
           }
   ```
   
   Could short circut the value check more often



##########
arrow-data/src/transform/list_view.rs:
##########
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::ArrayData;
+use crate::transform::_MutableArrayData;
+use crate::transform::utils::get_last_value_or_default;
+use arrow_buffer::ArrowNativeType;
+use num_integer::Integer;
+use num_traits::CheckedAdd;
+
+pub(super) fn build_extend<T: ArrowNativeType + Integer + CheckedAdd>(
+    array: &ArrayData,
+) -> crate::transform::Extend<'_> {
+    let offsets = array.buffer::<T>(0);
+    let sizes = array.buffer::<T>(1);
+    Box::new(
+        move |mutable: &mut _MutableArrayData, _index: usize, start: usize, 
len: usize| {
+            let offset_buffer = &mut mutable.buffer1;
+            let sizes_buffer = &mut mutable.buffer2;
+
+            for &offset in &offsets[start..start + len] {
+                offset_buffer.push(offset);
+            }
+
+            // sizes
+            for &size in &sizes[start..start + len] {
+                sizes_buffer.push(size);
+            }
+
+            // the beauty of views is that we don't need to copy child_data, 
we just splat
+            // the offsets and sizes.

Review Comment:
   🎇 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to