This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new e8285bea0ec clean up ByteView construction (#5879)
e8285bea0ec is described below
commit e8285bea0eca99316590fcefc8816e1df6581a5c
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Thu Jun 13 16:09:33 2024 -0400
clean up ByteView construction (#5879)
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-cast/src/cast/mod.rs | 44 +++++++++----------------------
parquet/src/arrow/buffer/offset_buffer.rs | 7 ++---
2 files changed, 17 insertions(+), 34 deletions(-)
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 354c31af695..55f2ed72836 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -46,7 +46,7 @@ use crate::cast::dictionary::*;
use crate::cast::list::*;
use crate::cast::string::*;
-use arrow_buffer::{IntervalMonthDayNano, ScalarBuffer};
+use arrow_buffer::IntervalMonthDayNano;
use arrow_data::ByteView;
use chrono::{NaiveTime, Offset, TimeZone, Utc};
use std::cmp::Ordering;
@@ -2341,47 +2341,29 @@ where
FROM::Offset: OffsetSizeTrait + ToPrimitive,
V: ByteViewType,
{
- let data = array.to_data();
- assert_eq!(data.data_type(), &FROM::DATA_TYPE);
-
+ let byte_array: &GenericByteArray<FROM> = array.as_bytes();
let len = array.len();
- let str_values_buf = data.buffers()[1].clone();
- let offsets = data.buffers()[0].typed_data::<FROM::Offset>();
+ let str_values_buf = byte_array.values().clone();
+ let offsets = byte_array.offsets();
- let mut views_builder = BufferBuilder::<u128>::new(len);
- for w in offsets.windows(2) {
+ let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
+ let block = views_builder.append_block(str_values_buf);
+ for (i, w) in offsets.windows(2).enumerate() {
let offset = w[0].to_u32().unwrap();
let end = w[1].to_u32().unwrap();
- let value_buf = &str_values_buf[offset as usize..end as usize];
let length = end - offset;
- if length <= 12 {
- let mut view_buffer = [0; 16];
- view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
- view_buffer[4..4 + value_buf.len()].copy_from_slice(value_buf);
- views_builder.append(u128::from_le_bytes(view_buffer));
+ if byte_array.is_null(i) {
+ views_builder.append_null();
} else {
- let view = ByteView {
- length,
- prefix: u32::from_le_bytes(value_buf[0..4].try_into().unwrap()),
- buffer_index: 0,
- offset,
- };
- views_builder.append(view.into());
+ // Safety: the input was a valid array so it valid UTF8 (if string). And
+ // all offsets were valid and we created the views correctly
+ unsafe { views_builder.append_view_unchecked(block, offset, length) }
}
}
assert_eq!(views_builder.len(), len);
-
- // Safety: the input was a valid array so it valid UTF8 (if string). And
- // all offsets were valid and we created the views correctly
- Ok(Arc::new(unsafe {
- GenericByteViewArray::<V>::new_unchecked(
- ScalarBuffer::new(views_builder.finish(), 0, len),
- vec![str_values_buf],
- data.nulls().cloned(),
- )
- }))
+ Ok(Arc::new(views_builder.finish()))
}
/// Helper function to cast from one `ByteViewType` array to `ByteArrayType` array.
diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs
index 181e69c669a..806f144d966 100644
--- a/parquet/src/arrow/buffer/offset_buffer.rs
+++ b/parquet/src/arrow/buffer/offset_buffer.rs
@@ -164,9 +164,10 @@ impl<I: OffsetSizeTrait> OffsetBuffer<I> {
let len = (end - start).to_usize().unwrap();
if len != 0 {
- builder
- .try_append_view(block, start.as_usize() as u32, len as u32)
- .unwrap();
+ // Safety: (1) the buffer is valid (2) the offsets are valid (3) the values in between are of ByteViewType
+ unsafe {
+ builder.append_view_unchecked(block, start.as_usize() as u32, len as u32);
+ }
} else {
builder.append_null();
}