jhorstmann commented on code in PR #1499:
URL: https://github.com/apache/arrow-rs/pull/1499#discussion_r848554356
##########
arrow/src/compute/kernels/boolean.rs:
##########
@@ -575,10 +577,242 @@ where
Ok(PrimitiveArray::<T>::from(data))
}
+/// Creates a (mostly) zero-copy slice of the given buffers so that they can
be combined
+/// in the same array with other buffers that start at offset 0.
+/// The only buffers that need an actual copy are booleans (if they are not
byte-aligned)
+/// and list/binary/string offsets because the arrow implementation requires
them to start at 0.
+/// This is useful when a kernel calculates a new validity bitmap but wants to
reuse other buffers.
+fn slice_buffers(
+ buffers: &[Buffer],
+ offset: usize,
+ len: usize,
+ data_type: &DataType,
+ child_data: &[ArrayData],
+) -> (Vec<Buffer>, Vec<ArrayData>) {
+ use std::mem::size_of;
+
+ if offset == 0 {
+ return (buffers.to_vec(), child_data.to_vec());
+ }
+
+ // we only need to do something special to child data in 2 of the match
branches
+ let mut result_child_data = None;
+
+ let result_buffers = match data_type {
+ DataType::Boolean => vec![buffers[0].bit_slice(offset, len)],
+ DataType::Int8 | DataType::UInt8 => {
+ vec![buffers[0].slice(offset * size_of::<u8>())]
+ }
+ DataType::Int16 | DataType::UInt16 | DataType::Float16 => {
+ vec![buffers[0].slice(offset * size_of::<u16>())]
+ }
+ DataType::Int32 | DataType::UInt32 | DataType::Float32 => {
+ vec![buffers[0].slice(offset * size_of::<u32>())]
+ }
+ DataType::Int64 | DataType::UInt64 | DataType::Float64 => {
+ vec![buffers[0].slice(offset * size_of::<u64>())]
+ }
+ DataType::Timestamp(_, _) | DataType::Duration(_) => {
+ vec![buffers[0].slice(offset * size_of::<u64>())]
+ }
+ DataType::Date32 | DataType::Time32(_) => {
+ vec![buffers[0].slice(offset * size_of::<u32>())]
+ }
+ DataType::Date64 | DataType::Time64(_) => {
+ vec![buffers[0].slice(offset * size_of::<u64>())]
+ }
+ DataType::Interval(IntervalUnit::YearMonth) => {
+ vec![buffers[0].slice(offset * size_of::<u32>())]
+ }
+ DataType::Interval(IntervalUnit::DayTime) => {
+ vec![buffers[0].slice(offset * 2 * size_of::<u32>())]
+ }
+ DataType::Interval(IntervalUnit::MonthDayNano) => {
+ vec![buffers[0].slice(offset * (2 * size_of::<u32>() +
size_of::<u64>()))]
+ }
+ DataType::Decimal(_, _) => vec![buffers[0].slice(offset *
size_of::<i128>())],
+ DataType::Dictionary(key_type, _) => match key_type.as_ref() {
+ DataType::Int8 | DataType::UInt8 => {
+ vec![buffers[0].slice(offset * size_of::<u8>())]
+ }
+ DataType::Int16 | DataType::UInt16 => {
+ vec![buffers[0].slice(offset * size_of::<u16>())]
+ }
+ DataType::Int32 | DataType::UInt32 => {
+ vec![buffers[0].slice(offset * size_of::<u32>())]
+ }
+ DataType::Int64 | DataType::UInt64 => {
+ vec![buffers[0].slice(offset * size_of::<u64>())]
+ }
+ _ => unreachable!(),
+ },
+ DataType::List(_) => {
+ // safe because for List the first buffer is guaranteed to contain
i32 offsets
+ let offsets = unsafe { &buffers[0].typed_data()[offset..] };
+ let first_offset = offsets[0] as usize;
+ let last_offset = offsets[len] as usize;
+ let nested_len = last_offset - first_offset;
+
+ // since we calculate a new offset buffer starting from 0 we also
have to slice the child data
+ result_child_data = Some(
+ child_data
+ .iter()
+ .map(|d| d.slice(first_offset, nested_len))
+ .collect(),
+ );
+ vec![offset_buffer_slice::<i32>(offsets, len)]
+ }
+ DataType::LargeList(_) => {
+ // safe because for LargeList the first buffer is guaranteed to
contain i64 offsets
+ let offsets = unsafe { &buffers[0].typed_data()[offset..] };
+ let first_offset = offsets[0] as usize;
+ let last_offset = offsets[len] as usize;
+ let nested_len = last_offset - first_offset;
+ // since we calculate a new offset buffer starting from 0 we also
have to slice the child data
+
+ result_child_data = Some(
+ child_data
+ .iter()
+ .map(|d| d.slice(first_offset, nested_len))
+ .collect(),
+ );
+ vec![offset_buffer_slice::<i64>(offsets, len)]
+ }
+ DataType::Binary | DataType::Utf8 => {
+ // safe because for Binary/Utf8 the first buffer is guaranteed to
contain i32 offsets
+ let offsets = unsafe { &buffers[0].typed_data()[offset..] };
+ let first_offset = offsets[0] as usize;
+ vec![
+ offset_buffer_slice::<i32>(offsets, len),
+ buffers[1].slice(first_offset * size_of::<i32>()),
+ ]
+ }
+ DataType::LargeBinary | DataType::LargeUtf8 => {
+ // safe because for LargeBinary/LargeUtf8 the first buffer is
guaranteed to contain i64 offsets
+ let offsets = unsafe { &buffers[0].typed_data()[offset..] };
+ let first_offset = offsets[0] as usize;
+ vec![
+ offset_buffer_slice::<i64>(offsets, len),
+ buffers[1].slice(first_offset * size_of::<i64>()),
+ ]
+ }
+ DataType::FixedSizeBinary(size) => {
+ vec![buffers[0].slice(offset * (*size as usize))]
+ }
+ DataType::FixedSizeList(_, _size) => {
+ // TODO: should this actually slice the child arrays?
+ vec![]
Review Comment:
Sorry, I know this logic is a bit confusing. For most types we only need to
slice the directly contained buffers, so returning these new buffers is
basically the happy path here. Struct and FixedSizeList layouts don't directly
contain buffers; instead, their data is stored in child arrays, and those child
arrays are what need to be sliced.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]