Dandandan commented on a change in pull request #9301:
URL: https://github.com/apache/arrow/pull/9301#discussion_r563175843



##########
File path: rust/arrow/src/compute/kernels/take.rs
##########
@@ -254,6 +254,154 @@ impl Default for TakeOptions {
     }
 }
 
+#[inline]
+fn maybe_take<T: ArrowPrimitiveType, I: ArrowPrimitiveType>(
+    values: &[T::Native],
+    index: I::Native,
+) -> Result<(usize, T::Native)>
+where
+    I::Native: ToPrimitive,
+{
+    match ToPrimitive::to_usize(&index) {
+        // there are two potential sources of errors here:
+        // 1. the index is out of bounds (panic as per `take` docs)
+        // 2. the index cannot be converted to a usize (mostly 32 bit systems)
+        Some(index) => Ok((index, values[index])),
+        None => Err(ArrowError::ComputeError("Cast to usize 
failed".to_string())),
+    }
+}
+
+// take implementation when neither values nor indices contain nulls
+fn take_no_nulls<T, I>(
+    values: &[T::Native],
+    indices: &[I::Native],
+) -> Result<(Buffer, Option<Buffer>)>
+where
+    T: ArrowPrimitiveType,
+    I: ArrowNumericType,
+    I::Native: ToPrimitive,
+{
+    let buffer = indices
+        .iter()
+        .map(|index| maybe_take::<T, I>(values, *index).map(|x| x.1))
+        .collect::<Result<Buffer>>()?;
+
+    Ok((buffer, None))
+}
+
+// take implementation when only values contain nulls
+fn take_values_nulls<T, I>(
+    values: &PrimitiveArray<T>,
+    indices: &[I::Native],
+) -> Result<(Buffer, Option<Buffer>)>
+where
+    T: ArrowPrimitiveType,
+    I: ArrowNumericType,
+    I::Native: ToPrimitive,
+{
+    let num_bytes = bit_util::ceil(indices.len(), 8);
+    let mut nulls = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
+    let null_slice = nulls.as_slice_mut();
+    let mut null_count = 0;
+
+    let values_values = values.values();
+
+    let buffer = indices
+        .iter()
+        .enumerate()
+        .map(|(i, index)| {
+            let (index, value) = maybe_take::<T, I>(values_values, *index)?;
+            if values.is_null(index) {
+                null_count += 1;
+                bit_util::unset_bit(null_slice, i);
+            }
+            Ok(value)
+        })
+        .collect::<Result<Buffer>>()?;
+
+    let nulls = if null_count == 0 {
+        // if only non-null values were taken
+        None
+    } else {
+        Some(nulls.into())
+    };
+
+    Ok((buffer, nulls))
+}
+
+// take implementation when only indices contain nulls
+fn take_indices_nulls<T, I>(
+    values: &[T::Native],
+    indices: &PrimitiveArray<I>,
+) -> Result<(Buffer, Option<Buffer>)>
+where
+    T: ArrowPrimitiveType,
+    I: ArrowNumericType,
+    I::Native: ToPrimitive,
+{
+    let buffer = indices
+        .iter()
+        .map(|index| {
+            match index {
+                // valid index => try to take using it
+                Some(index) => maybe_take::<T, I>(values, index).map(|x| x.1),
+                // null index => do not
+                None => Ok(T::Native::default()),
+            }
+        })
+        .collect::<Result<Buffer>>()?;
+
+    Ok((buffer, indices.data_ref().null_buffer().cloned()))
+}
+
+// take implementation when both values and indices contain nulls
+fn take_values_indices_nulls<T, I>(
+    values: &PrimitiveArray<T>,
+    indices: &PrimitiveArray<I>,
+) -> Result<(Buffer, Option<Buffer>)>
+where
+    T: ArrowPrimitiveType,
+    I: ArrowNumericType,
+    I::Native: ToPrimitive,
+{
+    let num_bytes = bit_util::ceil(indices.len(), 8);
+    let mut nulls = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
+    let null_slice = nulls.as_slice_mut();
+    let mut null_count = 0;
+
+    let values_values = values.values();
+    // in this branch it is **not** ok to read `index.values()` and try to use 
it,
+    // as doing so is UB when they come from a null slot.

Review comment:
       I am not 100% clear yet why this part itself is UB (as in reading 
outside bounds / uninitialized memory) as long as the null buffer is used 
correctly?
   Or maybe you mean that it could *trigger* UB when doing OOB reads when the 
value itself from the array would be bigger, which would trigger `UB` later on 
like before? At least when calling this I think the changes definitely makes 
sense, I wouldn't expect a panic (or OOB!) if the value in the indices was 
coming from a null slot :+1: 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to