tustvold commented on code in PR #6779:
URL: https://github.com/apache/arrow-rs/pull/6779#discussion_r1858180588


##########
arrow-select/src/interleave.rs:
##########
@@ -461,4 +498,125 @@ mod tests {
             DictionaryArray::<Int32Type>::from_iter(vec![Some("0"), Some("1"), 
Some("2"), None]);
         assert_eq!(array.as_ref(), &expected)
     }
+
+    #[test]
+    fn test_interleave_views() {
+        let values = StringArray::from_iter_values([
+            "hello",
+            "world_long_string_not_inlined",
+            "foo",
+            "bar",
+            "baz",
+        ]);
+        let view_a = StringViewArray::from(&values);
+
+        let values = StringArray::from_iter_values([
+            "test",
+            "data",
+            "more_long_string_not_inlined",
+            "views",
+            "here",
+        ]);
+        let view_b = StringViewArray::from(&values);
+
+        let indices = &[
+            (0, 2), // "foo"
+            (1, 0), // "test"
+            (0, 4), // "baz"
+            (1, 3), // "views"
+            (0, 1), // "world_long_string_not_inlined"
+        ];
+
+        // Test specialized implementation
+        let values = interleave(&[&view_a, &view_b], indices).unwrap();
+        let result = values.as_string_view();
+        assert_eq!(result.data_buffers().len(), 1);
+
+        // Test fallback implementation
+        let fallback = interleave_fallback(&[&view_a, &view_b], 
indices).unwrap();
+        let fallback_result = fallback.as_string_view();
+        // as of commit 97055631, assertion below, commented out to not block 
future improvements, passes:
+        // note that fallback_result has 2 buffers, but only one long enough 
string to warrant a buffer
+        // assert_eq!(fallback_result.data_buffers().len(), 2);

Review Comment:
   ```suggestion
           assert_eq!(fallback_result.data_buffers().len(), 2);
   ```
   
   Ultimately if this starts failing it indicates improvements have been made 
that might make the specialized impl redundant



##########
arrow-select/src/interleave.rs:
##########
@@ -231,6 +235,39 @@ fn interleave_dictionaries<K: ArrowDictionaryKeyType>(
     Ok(Arc::new(array))
 }
 
+fn interleave_views<T: ByteViewType>(
+    values: &[&dyn Array],
+    indices: &[(usize, usize)],
+) -> Result<ArrayRef, ArrowError> {
+    let interleaved = Interleave::<'_, GenericByteViewArray<T>>::new(values, 
indices);
+    let mut views_builder = BufferBuilder::new(indices.len());
+    let mut buffers = Vec::with_capacity(values[0].len());

Review Comment:
   Why is this the capacity?



##########
arrow-select/src/interleave.rs:
##########
@@ -231,6 +235,39 @@ fn interleave_dictionaries<K: ArrowDictionaryKeyType>(
     Ok(Arc::new(array))
 }
 
+fn interleave_views<T: ByteViewType>(
+    values: &[&dyn Array],
+    indices: &[(usize, usize)],
+) -> Result<ArrayRef, ArrowError> {
+    let interleaved = Interleave::<'_, GenericByteViewArray<T>>::new(values, 
indices);
+    let mut views_builder = BufferBuilder::new(indices.len());
+    let mut buffers = Vec::with_capacity(values[0].len());
+
+    let mut buffer_lookup = HashMap::new();

Review Comment:
   This misunderstands https://github.com/apache/arrow-rs/issues/6780, the 
issue isn't not removing buffers, it is that the arrays being interleaved may 
contain the same actual buffers, e.g. sourced from the same parquet dictionary 
page.



##########
arrow-select/src/interleave.rs:
##########
@@ -231,6 +239,39 @@ fn interleave_dictionaries<K: ArrowDictionaryKeyType>(
     Ok(Arc::new(array))
 }
 
+fn interleave_views<T: ByteViewType>(
+    values: &[&dyn Array],
+    indices: &[(usize, usize)],
+) -> Result<ArrayRef, ArrowError> {
+    let interleaved = Interleave::<'_, GenericByteViewArray<T>>::new(values, 
indices);
+    let mut views_builder = BufferBuilder::new(indices.len());
+    let mut buffers = Vec::with_capacity(values[0].len());
+
+    let mut buffer_lookup = HashMap::new();
+    for (array_idx, value_idx) in indices {
+        let array = interleaved.arrays[*array_idx];
+        let raw_view = array.views().get(*value_idx).unwrap();
+        let view = ByteView::from(*raw_view);
+
+        if view.length <= 12 {
+            views_builder.append(*raw_view);
+            continue;
+        }
+        // value is big enough to be in a variadic buffer
+        let new_buffer_idx: &mut u32 = buffer_lookup
+            .entry((*array_idx, view.buffer_index))
+            .or_insert_with(|| {
+                buffers.push(array.data_buffers()[view.buffer_index as 
usize].clone());
+                (buffers.len() - 1) as u32
+            });
+        views_builder.append(view.with_buffer_index(*new_buffer_idx).into());
+    }
+
+    let array =
+        GenericByteViewArray::<T>::try_new(views_builder.into(), buffers, 
interleaved.nulls)?;

Review Comment:
   Yes probably



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to