tustvold commented on code in PR #6779:
URL: https://github.com/apache/arrow-rs/pull/6779#discussion_r1858180588
##########
arrow-select/src/interleave.rs:
##########
@@ -461,4 +498,125 @@ mod tests {
DictionaryArray::<Int32Type>::from_iter(vec![Some("0"), Some("1"),
Some("2"), None]);
assert_eq!(array.as_ref(), &expected)
}
+
+ #[test]
+ fn test_interleave_views() {
+ let values = StringArray::from_iter_values([
+ "hello",
+ "world_long_string_not_inlined",
+ "foo",
+ "bar",
+ "baz",
+ ]);
+ let view_a = StringViewArray::from(&values);
+
+ let values = StringArray::from_iter_values([
+ "test",
+ "data",
+ "more_long_string_not_inlined",
+ "views",
+ "here",
+ ]);
+ let view_b = StringViewArray::from(&values);
+
+ let indices = &[
+ (0, 2), // "foo"
+ (1, 0), // "test"
+ (0, 4), // "baz"
+ (1, 3), // "views"
+ (0, 1), // "world_long_string_not_inlined"
+ ];
+
+ // Test specialized implementation
+ let values = interleave(&[&view_a, &view_b], indices).unwrap();
+ let result = values.as_string_view();
+ assert_eq!(result.data_buffers().len(), 1);
+
+ // Test fallback implementation
+ let fallback = interleave_fallback(&[&view_a, &view_b],
indices).unwrap();
+ let fallback_result = fallback.as_string_view();
+ // as of commit 97055631, assertion below, commented out to not block
future improvements, passes:
+ // note that fallback_result has 2 buffers, but only one long enough
string to warrant a buffer
+ // assert_eq!(fallback_result.data_buffers().len(), 2);
Review Comment:
```suggestion
assert_eq!(fallback_result.data_buffers().len(), 2);
```
Ultimately if this starts failing it indicates improvements have been made
that might make the specialized impl redundant
##########
arrow-select/src/interleave.rs:
##########
@@ -231,6 +235,39 @@ fn interleave_dictionaries<K: ArrowDictionaryKeyType>(
Ok(Arc::new(array))
}
+fn interleave_views<T: ByteViewType>(
+ values: &[&dyn Array],
+ indices: &[(usize, usize)],
+) -> Result<ArrayRef, ArrowError> {
+ let interleaved = Interleave::<'_, GenericByteViewArray<T>>::new(values,
indices);
+ let mut views_builder = BufferBuilder::new(indices.len());
+ let mut buffers = Vec::with_capacity(values[0].len());
Review Comment:
Why is this the capacity?
##########
arrow-select/src/interleave.rs:
##########
@@ -231,6 +235,39 @@ fn interleave_dictionaries<K: ArrowDictionaryKeyType>(
Ok(Arc::new(array))
}
+fn interleave_views<T: ByteViewType>(
+ values: &[&dyn Array],
+ indices: &[(usize, usize)],
+) -> Result<ArrayRef, ArrowError> {
+ let interleaved = Interleave::<'_, GenericByteViewArray<T>>::new(values,
indices);
+ let mut views_builder = BufferBuilder::new(indices.len());
+ let mut buffers = Vec::with_capacity(values[0].len());
+
+ let mut buffer_lookup = HashMap::new();
Review Comment:
This misunderstands https://github.com/apache/arrow-rs/issues/6780, the
issue isn't not removing buffers, it is that the arrays being interleaved may
contain the same actual buffers, e.g. sourced from the same parquet dictionary
page.
##########
arrow-select/src/interleave.rs:
##########
@@ -231,6 +239,39 @@ fn interleave_dictionaries<K: ArrowDictionaryKeyType>(
Ok(Arc::new(array))
}
+fn interleave_views<T: ByteViewType>(
+ values: &[&dyn Array],
+ indices: &[(usize, usize)],
+) -> Result<ArrayRef, ArrowError> {
+ let interleaved = Interleave::<'_, GenericByteViewArray<T>>::new(values,
indices);
+ let mut views_builder = BufferBuilder::new(indices.len());
+ let mut buffers = Vec::with_capacity(values[0].len());
+
+ let mut buffer_lookup = HashMap::new();
+ for (array_idx, value_idx) in indices {
+ let array = interleaved.arrays[*array_idx];
+ let raw_view = array.views().get(*value_idx).unwrap();
+ let view = ByteView::from(*raw_view);
+
+ if view.length <= 12 {
+ views_builder.append(*raw_view);
+ continue;
+ }
+ // value is big enough to be in a variadic buffer
+ let new_buffer_idx: &mut u32 = buffer_lookup
+ .entry((*array_idx, view.buffer_index))
+ .or_insert_with(|| {
+ buffers.push(array.data_buffers()[view.buffer_index as
usize].clone());
+ (buffers.len() - 1) as u32
+ });
+ views_builder.append(view.with_buffer_index(*new_buffer_idx).into());
+ }
+
+ let array =
+ GenericByteViewArray::<T>::try_new(views_builder.into(), buffers,
interleaved.nulls)?;
Review Comment:
Yes probably
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]