This is an automated email from the ASF dual-hosted git repository.
Jefffrey pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new da09872193 Implement native interleave for ListView (#9558)
da09872193 is described below
commit da09872193bb3805a3177678bf9005ddeea2333d
Author: Vegard Stikbakke <[email protected]>
AuthorDate: Wed May 20 03:44:03 2026 +0200
Implement native interleave for ListView (#9558)
This PR adds a native implementation of interleave for the ListView type
which uses a good heuristic thanks to @asubiotto, either
1. copy each row's elements and put them all into a new flat array, or
2. concatenate all source value array (and adjust offsets).
The latter is best when there is sharing of elements.
Closes #9342.
---------
Signed-off-by: Alfonso Subiotto Marques <[email protected]>
Co-authored-by: Alfonso Subiotto Marques <[email protected]>
---
arrow-select/src/interleave.rs | 232 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 232 insertions(+)
diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs
index 1bd34a7f3a..13d28747df 100644
--- a/arrow-select/src/interleave.rs
+++ b/arrow-select/src/interleave.rs
@@ -114,6 +114,8 @@ pub fn interleave(
DataType::Int64 => interleave_run_end::<Int64Type>(values,
indices),
t => unreachable!("illegal run-end type {t}"),
},
+ DataType::ListView(field) => interleave_list_view::<i32>(values,
indices, field),
+ DataType::LargeListView(field) => interleave_list_view::<i64>(values,
indices, field),
_ => interleave_fallback(values, indices)
}
}
@@ -482,6 +484,114 @@ fn interleave_run_end<R: RunEndIndexType>(
)?))
}
+fn interleave_list_view<O: OffsetSizeTrait>(
+ values: &[&dyn Array],
+ indices: &[(usize, usize)],
+ field: &FieldRef,
+) -> Result<ArrayRef, ArrowError> {
+ let interleaved = Interleave::<'_, GenericListViewArray<O>>::new(values,
indices);
+
+ // Pick whichever strategy produces fewer child elements:
+ // - Per-row copy: total = sum of selected sizes. Better for sparse
selections.
+ // - Concat + offset adjustment: total = sum of source backing array
lengths.
+ // Better when rows share backing elements via overlapping offset/size
ranges.
+ let concat_cost: usize = interleaved.arrays.iter().map(|lv|
lv.values().len()).sum();
+ let per_row_cost: usize = indices
+ .iter()
+ .map(|&(a, r)| interleaved.arrays[a].sizes()[r].as_usize())
+ .sum();
+
+ if per_row_cost <= concat_cost {
+ interleave_list_view_copy::<O>(&interleaved, indices, field)
+ } else {
+ interleave_list_view_concat::<O>(&interleaved, indices, field)
+ }
+}
+
+/// Per-row copy: copies each selected row's child elements into a new flat
array.
+fn interleave_list_view_copy<O: OffsetSizeTrait>(
+ interleaved: &Interleave<'_, GenericListViewArray<O>>,
+ indices: &[(usize, usize)],
+ field: &FieldRef,
+) -> Result<ArrayRef, ArrowError> {
+ let mut capacity = 0usize;
+ let mut offsets = Vec::with_capacity(indices.len());
+ let mut sizes = Vec::with_capacity(indices.len());
+ for &(array_idx, row_idx) in indices {
+ let list = interleaved.arrays[array_idx];
+ let size = list.sizes()[row_idx].as_usize();
+ offsets.push(
+ O::from_usize(capacity).ok_or_else(||
ArrowError::OffsetOverflowError(capacity))?,
+ );
+ sizes.push(O::from_usize(size).ok_or_else(||
ArrowError::OffsetOverflowError(size))?);
+ capacity += size;
+ }
+
+ let child_data: Vec<_> = interleaved
+ .arrays
+ .iter()
+ .map(|list| list.values().to_data())
+ .collect();
+ let child_data_refs: Vec<_> = child_data.iter().collect();
+ let mut mutable_child = MutableArrayData::new(child_data_refs, false,
capacity);
+ for &(array_idx, row_idx) in indices {
+ let list = interleaved.arrays[array_idx];
+ let start = list.offsets()[row_idx].as_usize();
+ let size = list.sizes()[row_idx].as_usize();
+ if size > 0 {
+ mutable_child.extend(array_idx, start, start + size);
+ }
+ }
+
+ Ok(Arc::new(GenericListViewArray::<O>::new(
+ field.clone(),
+ offsets.into(),
+ sizes.into(),
+ make_array(mutable_child.freeze()),
+ interleaved.nulls.clone(),
+ )))
+}
+
+/// Concat backing arrays: concatenates all source value arrays and adjusts
offsets.
+/// Preserves within-source element sharing.
+fn interleave_list_view_concat<O: OffsetSizeTrait>(
+ interleaved: &Interleave<'_, GenericListViewArray<O>>,
+ indices: &[(usize, usize)],
+ field: &FieldRef,
+) -> Result<ArrayRef, ArrowError> {
+ let child_arrays: Vec<&dyn Array> = interleaved
+ .arrays
+ .iter()
+ .map(|lv| lv.values().as_ref())
+ .collect();
+ let mut base_offsets = Vec::with_capacity(interleaved.arrays.len());
+ let mut running = 0usize;
+ for lv in &interleaved.arrays {
+ base_offsets.push(running);
+ running += lv.values().len();
+ }
+ let combined_values = concat(&child_arrays)?;
+
+ let mut new_offsets = Vec::with_capacity(indices.len());
+ let mut new_sizes = Vec::with_capacity(indices.len());
+ for &(array_idx, row_idx) in indices {
+ let lv = interleaved.arrays[array_idx];
+ let adjusted = lv.offsets()[row_idx].as_usize() +
base_offsets[array_idx];
+ new_offsets.push(
+ O::from_usize(adjusted).ok_or_else(||
ArrowError::OffsetOverflowError(adjusted))?,
+ );
+ new_sizes.push(lv.sizes()[row_idx]);
+ }
+
+ Ok(Arc::new(GenericListViewArray::<O>::new(
+ field.clone(),
+ new_offsets.into(),
+ new_sizes.into(),
+ combined_values,
+ interleaved.nulls.clone(),
+ )))
+}
+
/// Fallback implementation of interleave using [`MutableArrayData`]
fn interleave_fallback(
values: &[&dyn Array],
@@ -841,6 +951,128 @@ mod tests {
test_interleave_lists::<i64>();
}
+ fn test_interleave_list_views<O: OffsetSizeTrait>() {
+ // [[1, 2], null, [3]]
+ let mut a = GenericListBuilder::<O, _>::new(Int32Builder::new());
+ a.values().append_value(1);
+ a.values().append_value(2);
+ a.append(true);
+ a.append(false);
+ a.values().append_value(3);
+ a.append(true);
+ let a: GenericListViewArray<O> = a.finish().into();
+
+ // [[4], null, [5, 6, null]]
+ let mut b = GenericListBuilder::<O, _>::new(Int32Builder::new());
+ b.values().append_value(4);
+ b.append(true);
+ b.append(false);
+ b.values().append_value(5);
+ b.values().append_value(6);
+ b.values().append_null();
+ b.append(true);
+ let b: GenericListViewArray<O> = b.finish().into();
+
+ let values = interleave(&[&a, &b], &[(0, 2), (0, 1), (1, 0), (1, 2),
(1, 1)]).unwrap();
+ let v = values
+ .as_any()
+ .downcast_ref::<GenericListViewArray<O>>()
+ .unwrap();
+
+ // [[3], null, [4], [5, 6, null], null]
+ let mut expected = GenericListBuilder::<O,
_>::new(Int32Builder::new());
+ expected.values().append_value(3);
+ expected.append(true);
+ expected.append(false);
+ expected.values().append_value(4);
+ expected.append(true);
+ expected.values().append_value(5);
+ expected.values().append_value(6);
+ expected.values().append_null();
+ expected.append(true);
+ expected.append(false);
+ let expected: GenericListViewArray<O> = expected.finish().into();
+
+ assert_eq!(v, &expected);
+ }
+
+ #[test]
+ fn test_list_views() {
+ test_interleave_list_views::<i32>();
+ }
+
+ #[test]
+ fn test_large_list_views() {
+ test_interleave_list_views::<i64>();
+ }
+
+ #[test]
+ fn test_interleave_list_view_overlapping() {
+ let field = Arc::new(Field::new_list_field(DataType::Int64, false));
+
+ // lv_a: 10 rows, two groups of 5 sharing the same backing elements.
+ // rows 0-4 → offset 0, size 5 → [0,1,2,3,4]
+ // rows 5-9 → offset 5, size 5 → [5,6,7,8,9]
+ let lv_a = ListViewArray::new(
+ Arc::clone(&field),
+ ScalarBuffer::from(vec![0i32, 0, 0, 0, 0, 5, 5, 5, 5, 5]),
+ ScalarBuffer::from(vec![5i32; 10]),
+ Arc::new(Int64Array::from_iter_values(0..10)),
+ None,
+ );
+
+ // lv_b: 8 rows, two groups of 4 sharing the same backing elements.
+ // rows 0-3 → offset 0, size 3 → [100,101,102]
+ // rows 4-7 → offset 3, size 3 → [103,104,105]
+ let lv_b = ListViewArray::new(
+ Arc::clone(&field),
+ ScalarBuffer::from(vec![0i32, 0, 0, 0, 3, 3, 3, 3]),
+ ScalarBuffer::from(vec![3i32; 8]),
+ Arc::new(Int64Array::from_iter_values(100..106)),
+ None,
+ );
+
+ let indices: Vec<(usize, usize)> = vec![
+ (0, 0),
+ (1, 0),
+ (0, 5),
+ (1, 4),
+ (0, 1),
+ (1, 1),
+ (0, 6),
+ (1, 5),
+ ];
+ let result = interleave(&[&lv_a as &dyn Array, &lv_b as &dyn Array],
&indices).unwrap();
+ result
+ .to_data()
+ .validate_full()
+ .expect("result must be valid");
+
+ let result_lv = result.as_list_view::<i32>();
+ assert_eq!(result_lv.len(), 8);
+ assert_eq!(
+ result_lv.value(0).as_primitive::<Int64Type>().values(),
+ &[0, 1, 2, 3, 4]
+ );
+ assert_eq!(
+ result_lv.value(1).as_primitive::<Int64Type>().values(),
+ &[100, 101, 102]
+ );
+ assert_eq!(
+ result_lv.value(2).as_primitive::<Int64Type>().values(),
+ &[5, 6, 7, 8, 9]
+ );
+ assert_eq!(
+ result_lv.value(3).as_primitive::<Int64Type>().values(),
+ &[103, 104, 105]
+ );
+
+ // Backing elements = sum of source arrays (10 + 6 = 16), not per-row
+ // expansion (8 rows × avg ~4 = 32). Overlapping sharing is preserved.
+ let total_input_elements = lv_a.values().len() + lv_b.values().len();
+ assert_eq!(result_lv.values().len(), total_input_elements);
+ }
+
#[test]
fn test_struct_without_nulls() {
let fields = Fields::from(vec![