[GitHub] [arrow-rs] jhorstmann commented on a change in pull request #1499: Alternative implementation of nullif kernel by slicing nested buffers

GitBox Fri, 01 Apr 2022 02:46:16 -0700


jhorstmann commented on a change in pull request #1499:
URL: https://github.com/apache/arrow-rs/pull/1499#discussion_r840416500




##########
File path: arrow/src/compute/kernels/boolean.rs
##########
@@ -575,10 +577,241 @@ where
     Ok(PrimitiveArray::<T>::from(data))
 }
 
+/// Creates a (mostly) zero-copy slice of the given buffers so that they can 
be combined
+/// in the same array with other buffers that start at offset 0.
+/// The only buffers that need an actual copy are booleans and validity (if 
they are not byte-aligned).
+/// This is useful when a kernel calculates a new validity bitmap but wants to 
reuse other buffers.
+fn slice_buffers(
+    buffers: &[Buffer],
+    offset: usize,
+    len: usize,
+    data_type: &DataType,
+    child_data: &[ArrayData],
+) -> (Vec<Buffer>, Vec<ArrayData>) {
+    use std::mem::size_of;
+
+    if offset == 0 {
+        return (buffers.to_vec(), child_data.to_vec());
+    }
+
+    // we only need to do something special to child data in 2 of the match 
branches
+    let mut result_child_data = None;
+
+    let result_buffers = match data_type {
+        DataType::Boolean => vec![buffers[0].bit_slice(offset, len)],
+        DataType::Int8 | DataType::UInt8 => {
+            vec![buffers[0].slice(offset * size_of::<u8>())]
+        }
+        DataType::Int16 | DataType::UInt16 | DataType::Float16 => {
+            vec![buffers[0].slice(offset * size_of::<u16>())]
+        }
+        DataType::Int32 | DataType::UInt32 | DataType::Float32 => {
+            vec![buffers[0].slice(offset * size_of::<u32>())]
+        }
+        DataType::Int64 | DataType::UInt64 | DataType::Float64 => {
+            vec![buffers[0].slice(offset * size_of::<u64>())]
+        }
+        DataType::Timestamp(_, _) | DataType::Duration(_) => {
+            vec![buffers[0].slice(offset * size_of::<u64>())]
+        }
+        DataType::Date32 | DataType::Time32(_) => {
+            vec![buffers[0].slice(offset * size_of::<u32>())]
+        }
+        DataType::Date64 | DataType::Time64(_) => {
+            vec![buffers[0].slice(offset * size_of::<u64>())]
+        }
+        DataType::Interval(IntervalUnit::YearMonth) => {
+            vec![buffers[0].slice(offset * size_of::<u32>())]
+        }
+        DataType::Interval(IntervalUnit::DayTime) => {
+            vec![buffers[0].slice(offset * 2 * size_of::<u32>())]
+        }
+        DataType::Interval(IntervalUnit::MonthDayNano) => {
+            vec![buffers[0].slice(offset * (2 * size_of::<u32>() + 
size_of::<u64>()))]
+        }
+        DataType::Decimal(_, _) => vec![buffers[0].slice(offset * 
size_of::<i128>())],
+        DataType::Dictionary(key_type, _) => match key_type.as_ref() {
+            DataType::Int8 | DataType::UInt8 => {
+                vec![buffers[0].slice(offset * size_of::<u8>())]
+            }
+            DataType::Int16 | DataType::UInt16 => {
+                vec![buffers[0].slice(offset * size_of::<u16>())]
+            }
+            DataType::Int32 | DataType::UInt32 => {
+                vec![buffers[0].slice(offset * size_of::<u32>())]
+            }
+            DataType::Int64 | DataType::UInt64 => {
+                vec![buffers[0].slice(offset * size_of::<u64>())]
+            }
+            _ => unreachable!(),
+        },
+        DataType::List(_) => {
+            // safe because for List the first buffer is guaranteed to contain 
i32 offsets
+            vec![buffers[0].slice(offset * size_of::<i32>())]
+        }
+        DataType::LargeList(_) => {
+            // safe because for LargeList the first buffer is guaranteed to 
contain i64 offsets
+            vec![buffers[0].slice(offset * size_of::<i64>())]
+        }
+        DataType::Binary | DataType::Utf8 => {
+            // safe because for Binary/Utf8 the first buffer is guaranteed to 
contain i32 offsets
+            vec![
+                buffers[0].slice(offset * size_of::<i32>()),
+                buffers[1].clone(),
+            ]
+        }
+        DataType::LargeBinary | DataType::LargeUtf8 => {
+            // safe because for LargeBinary/LargeUtf8 the first buffer is 
guaranteed to contain i64 offsets
+            vec![
+                buffers[0].slice(offset * size_of::<i64>()),
+                buffers[1].clone(),
+            ]
+        }
+        DataType::FixedSizeBinary(size) => {
+            vec![buffers[0].slice(offset * (*size as usize))]
+        }
+        DataType::FixedSizeList(_field, size) => {
+            result_child_data = Some(
+                child_data
+                    .iter()
+                    .map(|d| {
+                        slice_array_data(
+                            d,
+                            offset * (*size as usize),
+                            len * (*size as usize),
+                        )
+                    })
+                    .collect(),
+            );
+            vec![]
+        }
+        DataType::Struct(_) => {
+            result_child_data = Some(
+                child_data
+                    .iter()
+                    .map(|d| slice_array_data(d, offset, len))
+                    .collect(),
+            );
+            vec![]
+        }
+        DataType::Union(..) => {
+            // TODO: verify this is actually correct
+            if buffers.len() > 1 {
+                // dense union, type ids and offsets
+                vec![
+                    buffers[0].slice(offset * size_of::<i8>()),
+                    buffers[1].slice(offset * size_of::<u32>()),
+                ]
+            } else {
+                // sparse union with only type ids
+                // should this also slice the child arrays?
+                vec![buffers[0].slice(offset * size_of::<i8>())]
+            }
+        }
+        DataType::Map(_, _) => {
+            // TODO: verify this is actually correct

Review comment:
       This is probably not correct since Map has offsets like a list, only the 
offsets buffer should need to be sliced.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-rs] jhorstmann commented on a change in pull request #1499: Alternative implementation of nullif kernel by slicing nested buffers

Reply via email to