alamb commented on code in PR #5922:
URL: https://github.com/apache/arrow-rs/pull/5922#discussion_r1650164445
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -429,6 +432,49 @@ impl<T: ByteViewType + ?Sized> From<ArrayData> for
GenericByteViewArray<T> {
}
}
+impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
+where
+ FROM: ByteArrayType,
+ FROM::Offset: OffsetSizeTrait + ToPrimitive,
+ V: ByteViewType<Native = FROM::Native>,
+{
+ fn from(byte_array: &GenericByteArray<FROM>) -> Self {
+ let offsets = byte_array.offsets();
+
+ let can_reuse_buffer = match offsets.last() {
Review Comment:
nice
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -429,6 +432,49 @@ impl<T: ByteViewType + ?Sized> From<ArrayData> for
GenericByteViewArray<T> {
}
}
+impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
Review Comment:
maybe it is worth some comments explaining what this does (very cleverly and
quickly converts a ByteArray to ByteViewArray)
##########
arrow-row/src/lib.rs:
##########
@@ -1255,11 +1270,13 @@ unsafe fn decode_column(
DataType::Boolean => Arc::new(decode_bool(rows, options)),
DataType::Binary => Arc::new(decode_binary::<i32>(rows,
options)),
DataType::LargeBinary => Arc::new(decode_binary::<i64>(rows,
options)),
+ DataType::BinaryView => Arc::new(decode_binary_view(rows,
options)),
DataType::FixedSizeBinary(size) =>
Arc::new(decode_fixed_size_binary(rows, size, options)),
DataType::Utf8 => Arc::new(decode_string::<i32>(rows, options,
validate_utf8)),
DataType::LargeUtf8 => Arc::new(decode_string::<i64>(rows,
options, validate_utf8)),
+ DataType::Utf8View => Arc::new(decode_string_view(rows,
options, validate_utf8)),
DataType::Dictionary(_, _) => todo!(),
- _ => unreachable!()
+ _ => unimplemented!("unsupported data type: {}", data_type),
Review Comment:
If would be nicer if this was an actual errr -- like `return
ArrowError::Unimplemented(format!("unsupported data type: {}", data_type))`
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -429,6 +432,49 @@ impl<T: ByteViewType + ?Sized> From<ArrayData> for
GenericByteViewArray<T> {
}
}
+impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
+where
+ FROM: ByteArrayType,
+ FROM::Offset: OffsetSizeTrait + ToPrimitive,
+ V: ByteViewType<Native = FROM::Native>,
+{
+ fn from(byte_array: &GenericByteArray<FROM>) -> Self {
+ let offsets = byte_array.offsets();
+
+ let can_reuse_buffer = match offsets.last() {
+ Some(offset) => offset.as_usize() < u32::MAX as usize,
+ None => true,
+ };
+
+ if can_reuse_buffer {
+ let len = byte_array.len();
+ let mut views_builder =
GenericByteViewBuilder::<V>::with_capacity(len);
+ let str_values_buf = byte_array.values().clone();
+ let block = views_builder.append_block(str_values_buf);
+ for (i, w) in offsets.windows(2).enumerate() {
+ let offset = w[0].as_usize();
+ let end = w[1].as_usize();
+ let length = end - offset;
+
+ if byte_array.is_null(i) {
+ views_builder.append_null();
+ } else {
+ // Safety: the input was a valid array so it valid UTF8
(if string). And
+ // all offsets were valid and we created the views
correctly
Review Comment:
I think validity is due to the offsets being correct - this loop is creating
the views
```suggestion
// all offsets were valid
```
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -2339,31 +2339,11 @@ fn cast_byte_to_view<FROM, V>(array: &dyn Array) ->
Result<ArrayRef, ArrowError>
where
FROM: ByteArrayType,
FROM::Offset: OffsetSizeTrait + ToPrimitive,
Review Comment:
Maybe we don't need the function anymore 🤔
##########
arrow-row/src/variable.rs:
##########
@@ -268,3 +268,30 @@ pub unsafe fn decode_string<I: OffsetSizeTrait>(
// Row data must have come from a valid UTF-8 array
GenericStringArray::from(builder.build_unchecked())
}
+
+/// Decodes a binary view array from `rows` with the provided `options`
+pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) ->
BinaryViewArray {
+ let decoded: GenericBinaryArray<i64> = decode_binary(rows, options);
+
+ // Better performance might be to directly build the binary view instead
of building to BinaryArray and then casting
+ // I suspect that the overhead is not a big deal.
Review Comment:
I agree. However, I think this is a good first implementation think we can
do it as a follow on PR: https://github.com/apache/arrow-rs/issues/5945
##########
arrow-ord/src/cmp.rs:
##########
@@ -663,6 +597,78 @@ impl<'a> ArrayOrd for &'a FixedSizeBinaryArray {
}
}
+/// Comparing two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
Review Comment:
```suggestion
/// Compares two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]