alamb commented on code in PR #9643:
URL: https://github.com/apache/arrow-rs/pull/9643#discussion_r3092910737
##########
arrow-select/src/take.rs:
##########
@@ -624,23 +623,65 @@ where
OffsetType::Native: OffsetSizeTrait,
PrimitiveArray<OffsetType>: From<Vec<OffsetType::Native>>,
{
- // TODO: Some optimizations can be done here such as if it is
- // taking the whole list or a contiguous sublist
- let (list_indices, offsets, null_buf) =
- take_value_indices_from_list::<IndexType, OffsetType>(values,
indices)?;
-
- let taken = take_impl::<OffsetType>(values.values().as_ref(),
&list_indices)?;
- let value_offsets = Buffer::from_vec(offsets);
- // create a new list with taken data and computed null information
+ let list_offsets = values.value_offsets();
+ let child_data = values.values().to_data();
+ let nulls = take_nulls(values.nulls(), indices);
+
+ let mut new_offsets = Vec::with_capacity(indices.len() + 1);
+ new_offsets.push(OffsetType::Native::zero());
+
+ let use_nulls = child_data.null_count() > 0;
+
+ let capacity = child_data
+ .len()
+ .checked_div(values.len())
+ .map(|v| v * indices.len())
+ .unwrap_or_default();
+
+ let mut array_data = MutableArrayData::new(vec![&child_data], use_nulls,
capacity);
+
+ match nulls.as_ref().filter(|n| n.null_count() > 0) {
+ None => {
+ for index in indices.values() {
+ let ix = index.as_usize();
+ let start = list_offsets[ix].as_usize();
+ let end = list_offsets[ix + 1].as_usize();
+ array_data.extend(0, start, end);
+
new_offsets.push(OffsetType::Native::from_usize(array_data.len()).unwrap());
+ }
+ }
+ Some(output_nulls) => {
+ new_offsets.resize(indices.len() + 1, OffsetType::Native::zero());
Review Comment:
Why initialize the offsets to zero, when they are immediately overwritten?
You could probably use `push` and `extend` instead of `fill` and setting
offsets directly
##########
arrow-select/src/take.rs:
##########
@@ -624,23 +623,65 @@ where
OffsetType::Native: OffsetSizeTrait,
PrimitiveArray<OffsetType>: From<Vec<OffsetType::Native>>,
{
- // TODO: Some optimizations can be done here such as if it is
- // taking the whole list or a contiguous sublist
- let (list_indices, offsets, null_buf) =
- take_value_indices_from_list::<IndexType, OffsetType>(values,
indices)?;
-
- let taken = take_impl::<OffsetType>(values.values().as_ref(),
&list_indices)?;
- let value_offsets = Buffer::from_vec(offsets);
- // create a new list with taken data and computed null information
+ let list_offsets = values.value_offsets();
+ let child_data = values.values().to_data();
+ let nulls = take_nulls(values.nulls(), indices);
+
+ let mut new_offsets = Vec::with_capacity(indices.len() + 1);
+ new_offsets.push(OffsetType::Native::zero());
+
+ let use_nulls = child_data.null_count() > 0;
+
+ let capacity = child_data
+ .len()
+ .checked_div(values.len())
+ .map(|v| v * indices.len())
+ .unwrap_or_default();
+
+ let mut array_data = MutableArrayData::new(vec![&child_data], use_nulls,
capacity);
+
+ match nulls.as_ref().filter(|n| n.null_count() > 0) {
+ None => {
+ for index in indices.values() {
+ let ix = index.as_usize();
+ let start = list_offsets[ix].as_usize();
+ let end = list_offsets[ix + 1].as_usize();
+ array_data.extend(0, start, end);
+
new_offsets.push(OffsetType::Native::from_usize(array_data.len()).unwrap());
+ }
+ }
+ Some(output_nulls) => {
+ new_offsets.resize(indices.len() + 1, OffsetType::Native::zero());
+ let mut last_filled = 0;
+ for i in output_nulls.valid_indices() {
+ let current =
OffsetType::Native::from_usize(array_data.len()).unwrap();
+ if last_filled < i {
+ new_offsets[last_filled + 1..=i].fill(current);
+ }
+ // SAFETY: `i` comes from validity bitmap over `indices`, so
in-bounds.
Review Comment:
I agree this is safe. However, I would feel better if we also added add an
assert above that verifies that `assert_eq!(nulls.len() + 1,indices.len())` as
an extra layer of protection
##########
arrow-select/src/take.rs:
##########
@@ -624,23 +623,65 @@ where
OffsetType::Native: OffsetSizeTrait,
PrimitiveArray<OffsetType>: From<Vec<OffsetType::Native>>,
{
- // TODO: Some optimizations can be done here such as if it is
- // taking the whole list or a contiguous sublist
- let (list_indices, offsets, null_buf) =
- take_value_indices_from_list::<IndexType, OffsetType>(values,
indices)?;
-
- let taken = take_impl::<OffsetType>(values.values().as_ref(),
&list_indices)?;
- let value_offsets = Buffer::from_vec(offsets);
- // create a new list with taken data and computed null information
+ let list_offsets = values.value_offsets();
+ let child_data = values.values().to_data();
+ let nulls = take_nulls(values.nulls(), indices);
+
+ let mut new_offsets = Vec::with_capacity(indices.len() + 1);
+ new_offsets.push(OffsetType::Native::zero());
+
+ let use_nulls = child_data.null_count() > 0;
+
+ let capacity = child_data
+ .len()
+ .checked_div(values.len())
+ .map(|v| v * indices.len())
+ .unwrap_or_default();
+
+ let mut array_data = MutableArrayData::new(vec![&child_data], use_nulls,
capacity);
+
+ match nulls.as_ref().filter(|n| n.null_count() > 0) {
+ None => {
+ for index in indices.values() {
+ let ix = index.as_usize();
+ let start = list_offsets[ix].as_usize();
+ let end = list_offsets[ix + 1].as_usize();
+ array_data.extend(0, start, end);
+
new_offsets.push(OffsetType::Native::from_usize(array_data.len()).unwrap());
+ }
+ }
+ Some(output_nulls) => {
+ new_offsets.resize(indices.len() + 1, OffsetType::Native::zero());
+ let mut last_filled = 0;
+ for i in output_nulls.valid_indices() {
+ let current =
OffsetType::Native::from_usize(array_data.len()).unwrap();
+ if last_filled < i {
Review Comment:
it might help to add comments here explaining that this fills in offsets for
null values
##########
arrow-select/src/take.rs:
##########
@@ -2497,39 +2538,6 @@ mod tests {
)
}
- #[test]
Review Comment:
It would make it easier to see what you have changed if you didn't also move
the tests around
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]