This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new af777cd53e Add support `StringView` / `BinaryView` in `interleave` 
kernel (#6779)
af777cd53e is described below

commit af777cd53e56f8382382137b6e08af249c475397
Author: Onur Satici <[email protected]>
AuthorDate: Fri Jan 17 23:02:22 2025 +0100

    Add support `StringView` / `BinaryView` in `interleave` kernel (#6779)
    
    * add byteview specific interleave
    
    * clippy
    
    * test
    
    * more clippy
    
    * more test coverage
    
    * enable assertion, remove explicit vector capacity
    
    * add new test, address comments
---
 arrow-select/src/interleave.rs | 244 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 244 insertions(+)

diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs
index 4a47017b79..3557bda8f4 100644
--- a/arrow-select/src/interleave.rs
+++ b/arrow-select/src/interleave.rs
@@ -24,7 +24,9 @@ use arrow_array::types::*;
 use arrow_array::*;
 use arrow_buffer::{ArrowNativeType, MutableBuffer, NullBuffer, 
NullBufferBuilder, OffsetBuffer};
 use arrow_data::transform::MutableArrayData;
+use arrow_data::ByteView;
 use arrow_schema::{ArrowError, DataType};
+use std::collections::HashMap;
 use std::sync::Arc;
 
 macro_rules! primitive_helper {
@@ -97,6 +99,8 @@ pub fn interleave(
         DataType::LargeUtf8 => interleave_bytes::<LargeUtf8Type>(values, 
indices),
         DataType::Binary => interleave_bytes::<BinaryType>(values, indices),
         DataType::LargeBinary => interleave_bytes::<LargeBinaryType>(values, 
indices),
+        DataType::BinaryView => interleave_views::<BinaryViewType>(values, 
indices),
+        DataType::Utf8View => interleave_views::<StringViewType>(values, 
indices),
         DataType::Dictionary(k, _) => downcast_integer! {
             k.as_ref() => (dict_helper, values, indices),
             _ => unreachable!("illegal dictionary key type {k}")
@@ -231,6 +235,41 @@ fn interleave_dictionaries<K: ArrowDictionaryKeyType>(
     Ok(Arc::new(array))
 }
 
+fn interleave_views<T: ByteViewType>(
+    values: &[&dyn Array],
+    indices: &[(usize, usize)],
+) -> Result<ArrayRef, ArrowError> {
+    let interleaved = Interleave::<'_, GenericByteViewArray<T>>::new(values, 
indices);
+    let mut views_builder = BufferBuilder::new(indices.len());
+    let mut buffers = Vec::new();
+
+    // (input array_index, input buffer_index) -> output buffer_index
+    let mut buffer_lookup: HashMap<(usize, u32), u32> = HashMap::new();
+    for (array_idx, value_idx) in indices {
+        let array = interleaved.arrays[*array_idx];
+        let raw_view = array.views().get(*value_idx).unwrap();
+        let view_len = *raw_view as u32;
+        if view_len <= 12 {
+            views_builder.append(*raw_view);
+            continue;
+        }
+        // value is big enough to be in a variadic buffer
+        let view = ByteView::from(*raw_view);
+        let new_buffer_idx: &mut u32 = buffer_lookup
+            .entry((*array_idx, view.buffer_index))
+            .or_insert_with(|| {
+                buffers.push(array.data_buffers()[view.buffer_index as 
usize].clone());
+                (buffers.len() - 1) as u32
+            });
+        views_builder.append(view.with_buffer_index(*new_buffer_idx).into());
+    }
+
+    let array = unsafe {
+        GenericByteViewArray::<T>::new_unchecked(views_builder.into(), 
buffers, interleaved.nulls)
+    };
+    Ok(Arc::new(array))
+}
+
 /// Fallback implementation of interleave using [`MutableArrayData`]
 fn interleave_fallback(
     values: &[&dyn Array],
@@ -461,4 +500,209 @@ mod tests {
             DictionaryArray::<Int32Type>::from_iter(vec![Some("0"), Some("1"), 
Some("2"), None]);
         assert_eq!(array.as_ref(), &expected)
     }
+
+    #[test]
+    fn test_interleave_views() {
+        let values = StringArray::from_iter_values([
+            "hello",
+            "world_long_string_not_inlined",
+            "foo",
+            "bar",
+            "baz",
+        ]);
+        let view_a = StringViewArray::from(&values);
+
+        let values = StringArray::from_iter_values([
+            "test",
+            "data",
+            "more_long_string_not_inlined",
+            "views",
+            "here",
+        ]);
+        let view_b = StringViewArray::from(&values);
+
+        let indices = &[
+            (0, 2), // "foo"
+            (1, 0), // "test"
+            (0, 4), // "baz"
+            (1, 3), // "views"
+            (0, 1), // "world_long_string_not_inlined"
+        ];
+
+        // Test specialized implementation
+        let values = interleave(&[&view_a, &view_b], indices).unwrap();
+        let result = values.as_string_view();
+        assert_eq!(result.data_buffers().len(), 1);
+
+        let fallback = interleave_fallback(&[&view_a, &view_b], 
indices).unwrap();
+        let fallback_result = fallback.as_string_view();
+        // note that fallback_result has 2 buffers, but only one long enough 
string to warrant a buffer
+        assert_eq!(fallback_result.data_buffers().len(), 2);
+
+        // Convert to strings for easier assertion
+        let collected: Vec<_> = result.iter().map(|x| x.map(|s| 
s.to_string())).collect();
+
+        let fallback_collected: Vec<_> = fallback_result
+            .iter()
+            .map(|x| x.map(|s| s.to_string()))
+            .collect();
+
+        assert_eq!(&collected, &fallback_collected);
+
+        assert_eq!(
+            &collected,
+            &[
+                Some("foo".to_string()),
+                Some("test".to_string()),
+                Some("baz".to_string()),
+                Some("views".to_string()),
+                Some("world_long_string_not_inlined".to_string()),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_interleave_views_with_nulls() {
+        let values = StringArray::from_iter([
+            Some("hello"),
+            None,
+            Some("foo_long_string_not_inlined"),
+            Some("bar"),
+            None,
+        ]);
+        let view_a = StringViewArray::from(&values);
+
+        let values = StringArray::from_iter([
+            Some("test"),
+            Some("data_long_string_not_inlined"),
+            None,
+            None,
+            Some("here"),
+        ]);
+        let view_b = StringViewArray::from(&values);
+
+        let indices = &[
+            (0, 1), // null
+            (1, 2), // null
+            (0, 2), // "foo_long_string_not_inlined"
+            (1, 3), // null
+            (0, 4), // null
+        ];
+
+        // Test specialized implementation
+        let values = interleave(&[&view_a, &view_b], indices).unwrap();
+        let result = values.as_string_view();
+        assert_eq!(result.data_buffers().len(), 1);
+
+        let fallback = interleave_fallback(&[&view_a, &view_b], 
indices).unwrap();
+        let fallback_result = fallback.as_string_view();
+
+        // Convert to strings for easier assertion
+        let collected: Vec<_> = result.iter().map(|x| x.map(|s| 
s.to_string())).collect();
+
+        let fallback_collected: Vec<_> = fallback_result
+            .iter()
+            .map(|x| x.map(|s| s.to_string()))
+            .collect();
+
+        assert_eq!(&collected, &fallback_collected);
+
+        assert_eq!(
+            &collected,
+            &[
+                None,
+                None,
+                Some("foo_long_string_not_inlined".to_string()),
+                None,
+                None,
+            ]
+        );
+    }
+
+    #[test]
+    fn test_interleave_views_multiple_buffers() {
+        let str1 = "very_long_string_from_first_buffer".as_bytes();
+        let str2 = "very_long_string_from_second_buffer".as_bytes();
+        let buffer1 = str1.to_vec().into();
+        let buffer2 = str2.to_vec().into();
+
+        let view1 = ByteView::new(str1.len() as u32, &str1[..4])
+            .with_buffer_index(0)
+            .with_offset(0)
+            .as_u128();
+        let view2 = ByteView::new(str2.len() as u32, &str2[..4])
+            .with_buffer_index(1)
+            .with_offset(0)
+            .as_u128();
+        let view_a =
+            StringViewArray::try_new(vec![view1, view2].into(), vec![buffer1, 
buffer2], None)
+                .unwrap();
+
+        let str3 = "another_very_long_string_buffer_three".as_bytes();
+        let str4 = "different_long_string_in_buffer_four".as_bytes();
+        let buffer3 = str3.to_vec().into();
+        let buffer4 = str4.to_vec().into();
+
+        let view3 = ByteView::new(str3.len() as u32, &str3[..4])
+            .with_buffer_index(0)
+            .with_offset(0)
+            .as_u128();
+        let view4 = ByteView::new(str4.len() as u32, &str4[..4])
+            .with_buffer_index(1)
+            .with_offset(0)
+            .as_u128();
+        let view_b =
+            StringViewArray::try_new(vec![view3, view4].into(), vec![buffer3, 
buffer4], None)
+                .unwrap();
+
+        let indices = &[
+            (0, 0), // String from first buffer of array A
+            (1, 0), // String from first buffer of array B
+            (0, 1), // String from second buffer of array A
+            (1, 1), // String from second buffer of array B
+            (0, 0), // String from first buffer of array A again
+            (1, 1), // String from second buffer of array B again
+        ];
+
+        // Test interleave
+        let values = interleave(&[&view_a, &view_b], indices).unwrap();
+        let result = values.as_string_view();
+
+        assert_eq!(
+            result.data_buffers().len(),
+            4,
+            "Expected four buffers (two from each input array)"
+        );
+
+        let result_strings: Vec<_> = result.iter().map(|x| x.map(|s| 
s.to_string())).collect();
+        assert_eq!(
+            result_strings,
+            vec![
+                Some("very_long_string_from_first_buffer".to_string()),
+                Some("another_very_long_string_buffer_three".to_string()),
+                Some("very_long_string_from_second_buffer".to_string()),
+                Some("different_long_string_in_buffer_four".to_string()),
+                Some("very_long_string_from_first_buffer".to_string()),
+                Some("different_long_string_in_buffer_four".to_string()),
+            ]
+        );
+
+        let views = result.views();
+        let buffer_indices: Vec<_> = views
+            .iter()
+            .map(|raw_view| ByteView::from(*raw_view).buffer_index)
+            .collect();
+
+        assert_eq!(
+            buffer_indices,
+            vec![
+                0, // First buffer from array A
+                1, // First buffer from array B
+                2, // Second buffer from array A
+                3, // Second buffer from array B
+                0, // First buffer from array A (reused)
+                3, // Second buffer from array B (reused)
+            ]
+        );
+    }
 }

Reply via email to