jorgecarleitao commented on a change in pull request #9301: URL: https://github.com/apache/arrow/pull/9301#discussion_r563186250
########## File path: rust/arrow/src/compute/kernels/take.rs ########## @@ -254,6 +254,154 @@ impl Default for TakeOptions { } } +#[inline] +fn maybe_take<T: ArrowPrimitiveType, I: ArrowPrimitiveType>( + values: &[T::Native], + index: I::Native, +) -> Result<(usize, T::Native)> +where + I::Native: ToPrimitive, +{ + match ToPrimitive::to_usize(&index) { + // there are two potential sources of errors here: + // 1. the index is out of bounds (panic as per `take` docs) + // 2. the index cannot be converted to a usize (mostly 32 bit systems) + Some(index) => Ok((index, values[index])), + None => Err(ArrowError::ComputeError("Cast to usize failed".to_string())), + } +} + +// take implementation when neither values nor indices contain nulls +fn take_no_nulls<T, I>( + values: &[T::Native], + indices: &[I::Native], +) -> Result<(Buffer, Option<Buffer>)> +where + T: ArrowPrimitiveType, + I: ArrowNumericType, + I::Native: ToPrimitive, +{ + let buffer = indices + .iter() + .map(|index| maybe_take::<T, I>(values, *index).map(|x| x.1)) + .collect::<Result<Buffer>>()?; + + Ok((buffer, None)) +} + +// take implementation when only values contain nulls +fn take_values_nulls<T, I>( + values: &PrimitiveArray<T>, + indices: &[I::Native], +) -> Result<(Buffer, Option<Buffer>)> +where + T: ArrowPrimitiveType, + I: ArrowNumericType, + I::Native: ToPrimitive, +{ + let num_bytes = bit_util::ceil(indices.len(), 8); + let mut nulls = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); + let null_slice = nulls.as_slice_mut(); + let mut null_count = 0; + + let values_values = values.values(); + + let buffer = indices + .iter() + .enumerate() + .map(|(i, index)| { + let (index, value) = maybe_take::<T, I>(values_values, *index)?; + if values.is_null(index) { + null_count += 1; + bit_util::unset_bit(null_slice, i); + } + Ok(value) + }) + .collect::<Result<Buffer>>()?; + + let nulls = if null_count == 0 { + // if only non-null values were taken + None + } else { + Some(nulls.into()) + }; + + Ok((buffer, nulls)) +} + +// take implementation when only indices contain nulls +fn take_indices_nulls<T, I>( + values: &[T::Native], + indices: &PrimitiveArray<I>, +) -> Result<(Buffer, Option<Buffer>)> +where + T: ArrowPrimitiveType, + I: ArrowNumericType, + I::Native: ToPrimitive, +{ + let buffer = indices + .iter() + .map(|index| { + match index { + // valid index => try to take using it + Some(index) => maybe_take::<T, I>(values, index).map(|x| x.1), + // null index => do not + None => Ok(T::Native::default()), + } + }) + .collect::<Result<Buffer>>()?; + + Ok((buffer, indices.data_ref().null_buffer().cloned())) +} + +// take implementation when both values and indices contain nulls +fn take_values_indices_nulls<T, I>( + values: &PrimitiveArray<T>, + indices: &PrimitiveArray<I>, +) -> Result<(Buffer, Option<Buffer>)> +where + T: ArrowPrimitiveType, + I: ArrowNumericType, + I::Native: ToPrimitive, +{ + let num_bytes = bit_util::ceil(indices.len(), 8); + let mut nulls = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); + let null_slice = nulls.as_slice_mut(); + let mut null_count = 0; + + let values_values = values.values(); + // in this branch it is **not** ok to read `index.values()` and try to use it, + // as doing so is UB when they come from a null slot. Review comment: I think that this becomes more academic at this point, and I do not know exactly why, but, in Rust, reading an initialized value is undefined behavior, see e.g. https://doc.rust-lang.org/std/mem/union.MaybeUninit.html We are abusing this in a lot of places, specially when we perform SIMD operations where we disregard the null bitmap and do the op over the whole buffer. I do not have a good answer here, at this point is more like "it works". In this particular case, though, the issue is that we are doing `*values.as_ptr().offset(index.to_usize())` where `index` comes from a null slot, which is very much an OOB read. This comment is about that: any operation on `index` in UB (even in trying to cast it to `usize`, since the previous values may represent `-1i32`, as offsets are signed). ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org