tustvold commented on code in PR #1720:
URL: https://github.com/apache/arrow-rs/pull/1720#discussion_r878850863
##########
arrow/src/compute/kernels/concat.rs:
##########
@@ -102,6 +102,25 @@ pub fn concat(arrays: &[&dyn Array]) -> Result<ArrayRef> {
Ok(make_array(mutable.freeze()))
}
+// Elementwise concatenation of StringArrays
+pub fn string_concat<Offset: OffsetSizeTrait>(
+ left: &GenericStringArray<Offset>,
+ right: &GenericStringArray<Offset>,
+) -> Result<GenericStringArray<Offset>> {
+ let left_bitmap = left.data().null_bitmap().unwrap();
+ let right_bitmap = right.data().null_bitmap().unwrap();
+ let concat_bitmap = (left_bitmap & right_bitmap).unwrap();
+ Ok((0..left.len().max(right.len()))
+ .map(|i| {
+ if concat_bitmap.is_set(i) {
+ Some(left.value(i).to_owned() + right.value(i))
+ } else {
+ None
+ }
+ })
+ .collect::<GenericStringArray<Offset>>())
Review Comment:
I think it would be **significantly** faster to do something like the
following (not at all tested):
```
// TODO: Handle non-zero offset in source ArrayData
if left.len() != right.len() {
return Err(...)
}
let nulls = match (left.data().null_bitmap(), right.data().null_bitmap()) {
(Some(left), Some(right)) => Some((left & right)?),
(Some(left), None) => Some(left),
(None, Some(right)) => Some(right),
(None, None) => None,
};
let left_offsets = left.value_offsets();
let right_offsets = right.value_offsets();
let left_values = left.value_data().as_slice();
let right_values = right.value_data().as_slice();
let left_iter = left_offsets.windows(2);
let right_iter = right_offsets.windows(2);
let mut output_offsets = BufferBuilder::<Offset>::new(left_offsets.len());
let mut output_values = BufferBuilder::<u8>::new(left_values.len() +
right_values.len());
output_offsets.append(0);
for (left, right) in left_iter.zip(right_iter) {
output_values.append_slice(&left_values[left[0]..left[1]]);
output_values.append_slice(&right_values[right[0]..right[1]]);
output_offsets.append(output_values.len()); // With checked cast
}
let mut builder = ArrayDataBuilder::new(Offset::get_data_type)
.len(left.len())
.add_buffer(output_offsets.finish())
.add_buffer(output_values.finish());
if let Some(nulls) = nulls {
builder = builder.null_bit_buffer(nulls);
}
// SAFETY - offsets valid by construction
unsafe {builder.build_unchecked()}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]