neilconway commented on code in PR #22390:
URL: https://github.com/apache/datafusion/pull/22390#discussion_r3282490876
##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -468,6 +533,113 @@ fn general_remove<OffsetSize: OffsetSizeTrait>(
)?))
}
+/// For each element of `list_array[i]`, removed up to `arr_n[i]` occurrences
+/// of `needle[0]` (scalar element broadcasted).
+///
+/// This is a specialized version of `general_remove` for scalar elements that
+/// uses bulk comparison for better performance.
+fn general_remove_with_scalar<OffsetSize: OffsetSizeTrait>(
+ list_array: &GenericListArray<OffsetSize>,
+ needle: &ArrayRef,
+ arr_n: &[i64],
+) -> Result<ArrayRef> {
+ let list_field = match list_array.data_type() {
+ DataType::List(field) | DataType::LargeList(field) => field,
+ _ => {
+ return exec_err!(
+ "Expected List or LargeList data type, got {:?}",
+ list_array.data_type()
+ );
+ }
+ };
+
+ let list_offsets = list_array.offsets();
+ let first_offset = list_offsets[0].to_usize().unwrap();
+ let last_offset = list_offsets[list_offsets.len() - 1].to_usize().unwrap();
+ let values_range_len = last_offset - first_offset;
+ let values_slice = list_array.values().slice(first_offset,
values_range_len);
+ let original_data = values_slice.to_data();
+ let mut offsets = Vec::<OffsetSize>::with_capacity(list_array.len() + 1);
+ offsets.push(OffsetSize::zero());
+
+ let mut mutable = MutableArrayData::with_capacities(
+ vec![&original_data],
+ false,
+ Capacities::Array(original_data.len()),
+ );
+ let nulls = list_array.nulls().cloned();
+ let keep_mask =
+ arrow_ord::cmp::distinct(list_array.values(),
&Scalar::new(Arc::clone(needle)))?;
+ let remove_bits = match keep_mask.nulls() {
+ Some(validity) => !(&(keep_mask.values() & validity.inner())),
+ None => !keep_mask.values(),
+ };
+
+ for (row_index, offset_window) in list_offsets.windows(2).enumerate() {
+ if nulls.as_ref().is_some_and(|nulls| nulls.is_null(row_index)) {
+ offsets.push(offsets[row_index]);
+ continue;
+ }
+
+ let start = offset_window[0].to_usize().unwrap() - first_offset;
+ let end = offset_window[1].to_usize().unwrap() - first_offset;
+
+ let n = arr_n[row_index];
+
+ if n <= 0 {
+ mutable.extend(0, start, end);
+ offsets.push(offsets[row_index] + OffsetSize::usize_as(end -
start));
+ continue;
+ }
+
+ let row_len = end - start;
+ let row_remove_bits = remove_bits.slice(first_offset + start, row_len);
+ let num_to_remove = row_remove_bits.count_set_bits();
+
+ if num_to_remove == 0 {
+ mutable.extend(0, start, end);
+ offsets.push(offsets[row_index] + OffsetSize::usize_as(row_len));
+ continue;
+ }
+
+ let max_removals = n.min(num_to_remove as i64) as usize;
+
+ // Iterate only over the positions that need removal using set_indices,
+ // which is more efficient than scanning every bit.
Review Comment:
Might be worth elaborating that the win here is mostly because we expect the
# of values-to-remove is a lot smaller than the total array size, which it
usually (but not always) will be.
##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -468,6 +531,98 @@ fn general_remove<OffsetSize: OffsetSizeTrait>(
)?))
}
+/// For each element of `list_array[i]`, removed up to `arr_n[i]` occurrences
+/// of `needle[0]` (scalar element broadcasted).
+///
+/// This is a specialized version of `general_remove` for scalar elements that
+/// uses bulk comparison for better performance.
+fn general_remove_with_scalar<OffsetSize: OffsetSizeTrait>(
+ list_array: &GenericListArray<OffsetSize>,
+ needle: &ArrayRef,
+ arr_n: &[i64],
+) -> Result<ArrayRef> {
+ let list_field = match list_array.data_type() {
+ DataType::List(field) | DataType::LargeList(field) => field,
+ _ => {
+ return exec_err!(
+ "Expected List or LargeList data type, got {:?}",
+ list_array.data_type()
+ );
+ }
+ };
+ let original_data = list_array.values().to_data();
Review Comment:
The over-allocation was one part, but the bigger concern is calling the
`distinct` kernel on the entire values buffer (see other comment).
##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -468,6 +533,113 @@ fn general_remove<OffsetSize: OffsetSizeTrait>(
)?))
}
+/// For each element of `list_array[i]`, removed up to `arr_n[i]` occurrences
+/// of `needle[0]` (scalar element broadcasted).
+///
+/// This is a specialized version of `general_remove` for scalar elements that
+/// uses bulk comparison for better performance.
+fn general_remove_with_scalar<OffsetSize: OffsetSizeTrait>(
+ list_array: &GenericListArray<OffsetSize>,
+ needle: &ArrayRef,
+ arr_n: &[i64],
+) -> Result<ArrayRef> {
+ let list_field = match list_array.data_type() {
+ DataType::List(field) | DataType::LargeList(field) => field,
+ _ => {
+ return exec_err!(
+ "Expected List or LargeList data type, got {:?}",
+ list_array.data_type()
+ );
+ }
+ };
+
+ let list_offsets = list_array.offsets();
+ let first_offset = list_offsets[0].to_usize().unwrap();
+ let last_offset = list_offsets[list_offsets.len() - 1].to_usize().unwrap();
+ let values_range_len = last_offset - first_offset;
+ let values_slice = list_array.values().slice(first_offset,
values_range_len);
+ let original_data = values_slice.to_data();
+ let mut offsets = Vec::<OffsetSize>::with_capacity(list_array.len() + 1);
+ offsets.push(OffsetSize::zero());
+
+ let mut mutable = MutableArrayData::with_capacities(
+ vec![&original_data],
+ false,
+ Capacities::Array(original_data.len()),
+ );
+ let nulls = list_array.nulls().cloned();
+ let keep_mask =
+ arrow_ord::cmp::distinct(list_array.values(),
&Scalar::new(Arc::clone(needle)))?;
Review Comment:
This will call the distinct kernel on all the elements in the value buffer,
not just the ones that are visible in a sliced array.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]