jhorstmann commented on a change in pull request #8170:
URL: https://github.com/apache/arrow/pull/8170#discussion_r487396427
##########
File path: rust/arrow/src/compute/kernels/take.rs
##########
@@ -166,42 +166,124 @@ fn take_primitive<T>(values: &ArrayRef, indices:
&UInt32Array) -> Result<ArrayRe
where
T: ArrowPrimitiveType,
{
- let mut builder = PrimitiveBuilder::<T>::new(indices.len());
- let a = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
- for i in 0..indices.len() {
- if indices.is_null(i) {
- // populate with null if index is null
- builder.append_null()?;
- } else {
- // get index value to use in looking up the value from `values`
- let ix = indices.value(i) as usize;
- if a.is_valid(ix) {
- builder.append_value(a.value(ix))?;
- } else {
- builder.append_null()?;
+ let data_len = indices.len();
+
+ let array = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
+
+ let num_bytes = bit_util::ceil(data_len, 8);
+ let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes,
false);
+
+ let null_slice = null_buf.data_mut();
+
+ let new_values: Vec<T::Native> = (0..data_len)
+ .map(|i| {
+ let index = indices.value(i) as usize;
+ if array.is_valid(index) {
+ bit_util::set_bit(null_slice, i);
+ }
+ array.value(index)
+ })
+ .collect();
+
+ let nulls = match indices.data_ref().null_buffer() {
+ Some(buffer) => buffer_bin_and(buffer, 0, &null_buf.freeze(), 0,
indices.len()),
+ None => null_buf.freeze(),
+ };
+
+ let data = ArrayData::new(
+ T::get_data_type(),
+ indices.len(),
+ None,
+ Some(nulls),
+ 0,
+ vec![Buffer::from(new_values.to_byte_slice())],
+ vec![],
+ );
+ return Ok(Arc::new(PrimitiveArray::<T>::from(Arc::new(data))));
+}
+
+/// `take` implementation for boolean arrays
+fn take_boolean(values: &ArrayRef, indices: &UInt32Array) -> Result<ArrayRef> {
+ let data_len = indices.len();
+
+ let array = values.as_any().downcast_ref::<BooleanArray>().unwrap();
+
+ let num_byte = bit_util::ceil(data_len, 8);
+ let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte,
false);
+ let mut val_buf = MutableBuffer::new(num_byte).with_bitset(num_byte,
false);
+
+ let null_slice = null_buf.data_mut();
+ let val_slice = val_buf.data_mut();
+
+ (0..data_len).for_each(|i| {
+ let index = indices.value(i) as usize;
+ if array.is_valid(index) {
+ bit_util::set_bit(null_slice, i);
+ if array.value(index) {
+ bit_util::set_bit(val_slice, i);
}
}
- }
- Ok(Arc::new(builder.finish()) as ArrayRef)
+ });
+
+ let nulls = match indices.data_ref().null_buffer() {
+ Some(buffer) => buffer_bin_and(buffer, 0, &null_buf.freeze(), 0,
indices.len()),
+ None => null_buf.freeze(),
+ };
+
+ let data = ArrayData::new(
+ DataType::Boolean,
+ indices.len(),
+ None,
+ Some(nulls),
+ 0,
+ vec![val_buf.freeze()],
+ vec![],
+ );
+ return Ok(Arc::new(BooleanArray::from(Arc::new(data))));
}
/// `take` implementation for string arrays
fn take_string(values: &ArrayRef, indices: &UInt32Array) -> Result<ArrayRef> {
- let mut builder = StringBuilder::new(indices.len());
- let a = values.as_any().downcast_ref::<StringArray>().unwrap();
- for i in 0..indices.len() {
- if indices.is_null(i) {
- builder.append(false)?;
+ let data_len = indices.len();
+
+ let array = values.as_any().downcast_ref::<StringArray>().unwrap();
+
+ let num_bytes = bit_util::ceil(data_len, 8);
+ let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes,
false);
+
+ let mut offsets = Vec::with_capacity(data_len + 1);
+ let mut values = Vec::with_capacity(data_len);
+ let mut length_so_far = 0;
+
+ offsets.push(length_so_far);
+ for i in 0..data_len {
+ let index = indices.value(i) as usize;
+
+ let s = if array.is_valid(index) {
Review comment:
I think for strings it might make sense to also check the index validity
inside the loop, to keep the resulting string data small. If the index is null
it's value is probably 0 and in the worst case there could be very long string
there in the original array.
Another question, also for the primitive case, would be whether we can
actually rely on the value being 0 when the bitmap says it is null. That might
no longer be the case if you for example apply the `take` kernel twice.
##########
File path: rust/arrow/src/compute/kernels/take.rs
##########
@@ -166,42 +166,124 @@ fn take_primitive<T>(values: &ArrayRef, indices:
&UInt32Array) -> Result<ArrayRe
where
T: ArrowPrimitiveType,
{
- let mut builder = PrimitiveBuilder::<T>::new(indices.len());
- let a = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
- for i in 0..indices.len() {
- if indices.is_null(i) {
- // populate with null if index is null
- builder.append_null()?;
- } else {
- // get index value to use in looking up the value from `values`
- let ix = indices.value(i) as usize;
- if a.is_valid(ix) {
- builder.append_value(a.value(ix))?;
- } else {
- builder.append_null()?;
+ let data_len = indices.len();
+
+ let array = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
+
+ let num_bytes = bit_util::ceil(data_len, 8);
+ let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes,
false);
+
+ let null_slice = null_buf.data_mut();
+
+ let new_values: Vec<T::Native> = (0..data_len)
+ .map(|i| {
+ let index = indices.value(i) as usize;
+ if array.is_valid(index) {
+ bit_util::set_bit(null_slice, i);
+ }
+ array.value(index)
+ })
+ .collect();
+
+ let nulls = match indices.data_ref().null_buffer() {
+ Some(buffer) => buffer_bin_and(buffer, 0, &null_buf.freeze(), 0,
indices.len()),
+ None => null_buf.freeze(),
+ };
+
+ let data = ArrayData::new(
+ T::get_data_type(),
+ indices.len(),
+ None,
+ Some(nulls),
+ 0,
+ vec![Buffer::from(new_values.to_byte_slice())],
+ vec![],
+ );
+ return Ok(Arc::new(PrimitiveArray::<T>::from(Arc::new(data))));
+}
+
+/// `take` implementation for boolean arrays
+fn take_boolean(values: &ArrayRef, indices: &UInt32Array) -> Result<ArrayRef> {
+ let data_len = indices.len();
+
+ let array = values.as_any().downcast_ref::<BooleanArray>().unwrap();
+
+ let num_byte = bit_util::ceil(data_len, 8);
+ let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte,
false);
+ let mut val_buf = MutableBuffer::new(num_byte).with_bitset(num_byte,
false);
+
+ let null_slice = null_buf.data_mut();
+ let val_slice = val_buf.data_mut();
+
+ (0..data_len).for_each(|i| {
+ let index = indices.value(i) as usize;
+ if array.is_valid(index) {
+ bit_util::set_bit(null_slice, i);
+ if array.value(index) {
+ bit_util::set_bit(val_slice, i);
}
}
- }
- Ok(Arc::new(builder.finish()) as ArrayRef)
+ });
+
+ let nulls = match indices.data_ref().null_buffer() {
+ Some(buffer) => buffer_bin_and(buffer, 0, &null_buf.freeze(), 0,
indices.len()),
+ None => null_buf.freeze(),
+ };
+
+ let data = ArrayData::new(
+ DataType::Boolean,
+ indices.len(),
+ None,
+ Some(nulls),
+ 0,
+ vec![val_buf.freeze()],
+ vec![],
+ );
+ return Ok(Arc::new(BooleanArray::from(Arc::new(data))));
}
/// `take` implementation for string arrays
fn take_string(values: &ArrayRef, indices: &UInt32Array) -> Result<ArrayRef> {
- let mut builder = StringBuilder::new(indices.len());
- let a = values.as_any().downcast_ref::<StringArray>().unwrap();
- for i in 0..indices.len() {
- if indices.is_null(i) {
- builder.append(false)?;
+ let data_len = indices.len();
+
+ let array = values.as_any().downcast_ref::<StringArray>().unwrap();
+
+ let num_bytes = bit_util::ceil(data_len, 8);
+ let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes,
false);
+
+ let mut offsets = Vec::with_capacity(data_len + 1);
+ let mut values = Vec::with_capacity(data_len);
+ let mut length_so_far = 0;
+
+ offsets.push(length_so_far);
+ for i in 0..data_len {
+ let index = indices.value(i) as usize;
+
+ let s = if array.is_valid(index) {
Review comment:
I think for strings it might make sense to also check the index validity
inside the loop, to keep the resulting string data small. If the index is null
it's value is probably 0 and in the worst case there could be very long string
there in the original array.
Another question, also for the primitive case, would be whether we can
actually rely on the value being 0 when the bitmap says it is null. That might
no longer be the case if you for example apply the `take` kernel twice.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]