jhorstmann commented on a change in pull request #1499:
URL: https://github.com/apache/arrow-rs/pull/1499#discussion_r840416500
##########
File path: arrow/src/compute/kernels/boolean.rs
##########
@@ -575,10 +577,241 @@ where
Ok(PrimitiveArray::<T>::from(data))
}
+/// Creates a (mostly) zero-copy slice of the given buffers so that they can be combined
+/// in the same array with other buffers that start at offset 0.
+/// The only buffers that need an actual copy are booleans and validity (if they are not byte-aligned).
+/// This is useful when a kernel calculates a new validity bitmap but wants to reuse other buffers.
+fn slice_buffers(
+ buffers: &[Buffer],
+ offset: usize,
+ len: usize,
+ data_type: &DataType,
+ child_data: &[ArrayData],
+) -> (Vec<Buffer>, Vec<ArrayData>) {
+ use std::mem::size_of;
+
+ if offset == 0 {
+ return (buffers.to_vec(), child_data.to_vec());
+ }
+
+ // we only need to do something special to child data in 2 of the match
branches
+ let mut result_child_data = None;
+
+ let result_buffers = match data_type {
+ DataType::Boolean => vec![buffers[0].bit_slice(offset, len)],
+ DataType::Int8 | DataType::UInt8 => {
+ vec![buffers[0].slice(offset * size_of::<u8>())]
+ }
+ DataType::Int16 | DataType::UInt16 | DataType::Float16 => {
+ vec![buffers[0].slice(offset * size_of::<u16>())]
+ }
+ DataType::Int32 | DataType::UInt32 | DataType::Float32 => {
+ vec![buffers[0].slice(offset * size_of::<u32>())]
+ }
+ DataType::Int64 | DataType::UInt64 | DataType::Float64 => {
+ vec![buffers[0].slice(offset * size_of::<u64>())]
+ }
+ DataType::Timestamp(_, _) | DataType::Duration(_) => {
+ vec![buffers[0].slice(offset * size_of::<u64>())]
+ }
+ DataType::Date32 | DataType::Time32(_) => {
+ vec![buffers[0].slice(offset * size_of::<u32>())]
+ }
+ DataType::Date64 | DataType::Time64(_) => {
+ vec![buffers[0].slice(offset * size_of::<u64>())]
+ }
+ DataType::Interval(IntervalUnit::YearMonth) => {
+ vec![buffers[0].slice(offset * size_of::<u32>())]
+ }
+ DataType::Interval(IntervalUnit::DayTime) => {
+ vec![buffers[0].slice(offset * 2 * size_of::<u32>())]
+ }
+ DataType::Interval(IntervalUnit::MonthDayNano) => {
+ vec![buffers[0].slice(offset * (2 * size_of::<u32>() +
size_of::<u64>()))]
+ }
+ DataType::Decimal(_, _) => vec![buffers[0].slice(offset *
size_of::<i128>())],
+ DataType::Dictionary(key_type, _) => match key_type.as_ref() {
+ DataType::Int8 | DataType::UInt8 => {
+ vec![buffers[0].slice(offset * size_of::<u8>())]
+ }
+ DataType::Int16 | DataType::UInt16 => {
+ vec![buffers[0].slice(offset * size_of::<u16>())]
+ }
+ DataType::Int32 | DataType::UInt32 => {
+ vec![buffers[0].slice(offset * size_of::<u32>())]
+ }
+ DataType::Int64 | DataType::UInt64 => {
+ vec![buffers[0].slice(offset * size_of::<u64>())]
+ }
+ _ => unreachable!(),
+ },
+ DataType::List(_) => {
+ // safe because for List the first buffer is guaranteed to contain
i32 offsets
+ vec![buffers[0].slice(offset * size_of::<i32>())]
+ }
+ DataType::LargeList(_) => {
+ // safe because for LargeList the first buffer is guaranteed to
contain i64 offsets
+ vec![buffers[0].slice(offset * size_of::<i64>())]
+ }
+ DataType::Binary | DataType::Utf8 => {
+ // safe because for Binary/Utf8 the first buffer is guaranteed to
contain i32 offsets
+ vec![
+ buffers[0].slice(offset * size_of::<i32>()),
+ buffers[1].clone(),
+ ]
+ }
+ DataType::LargeBinary | DataType::LargeUtf8 => {
+ // safe because for LargeBinary/LargeUtf8 the first buffer is
guaranteed to contain i64 offsets
+ vec![
+ buffers[0].slice(offset * size_of::<i64>()),
+ buffers[1].clone(),
+ ]
+ }
+ DataType::FixedSizeBinary(size) => {
+ vec![buffers[0].slice(offset * (*size as usize))]
+ }
+ DataType::FixedSizeList(_field, size) => {
+ result_child_data = Some(
+ child_data
+ .iter()
+ .map(|d| {
+ slice_array_data(
+ d,
+ offset * (*size as usize),
+ len * (*size as usize),
+ )
+ })
+ .collect(),
+ );
+ vec![]
+ }
+ DataType::Struct(_) => {
+ result_child_data = Some(
+ child_data
+ .iter()
+ .map(|d| slice_array_data(d, offset, len))
+ .collect(),
+ );
+ vec![]
+ }
+ DataType::Union(..) => {
+ // TODO: verify this is actually correct
+ if buffers.len() > 1 {
+ // dense union, type ids and offsets
+ vec![
+ buffers[0].slice(offset * size_of::<i8>()),
+ buffers[1].slice(offset * size_of::<u32>()),
+ ]
+ } else {
+ // sparse union with only type ids
+ // should this also slice the child arrays?
+ vec![buffers[0].slice(offset * size_of::<i8>())]
+ }
+ }
+ DataType::Map(_, _) => {
+ // TODO: verify this is actually correct
Review comment:
This is probably not correct: Map has offsets like a list, so only the
offsets buffer should need to be sliced.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]