lyne7-sc commented on code in PR #20243:
URL: https://github.com/apache/datafusion/pull/20243#discussion_r2788536237
##########
datafusion/functions-nested/src/set_ops.rs:
##########
@@ -358,69 +364,84 @@ fn generic_set_lists<OffsetSize: OffsetSizeTrait>(
"{set_op:?} is not implemented for '{l:?}' and '{r:?}'"
);
- let mut offsets = vec![OffsetSize::usize_as(0)];
- let mut new_arrays = vec![];
+ // Convert all values to rows in batch for performance.
let converter = RowConverter::new(vec![SortField::new(l.value_type())])?;
- for (l_arr, r_arr) in l.iter().zip(r.iter()) {
- let last_offset = *offsets.last().unwrap();
-
- let (l_values, r_values) = match (l_arr, r_arr) {
- (Some(l_arr), Some(r_arr)) => (
- converter.convert_columns(&[l_arr])?,
- converter.convert_columns(&[r_arr])?,
- ),
- _ => {
- offsets.push(last_offset);
- continue;
- }
- };
-
- let l_iter = l_values.iter().sorted().dedup();
- let values_set: HashSet<_> = l_iter.clone().collect();
- let mut rows = if set_op == SetOp::Union {
- l_iter.collect()
- } else {
- vec![]
- };
+ let rows_l = converter.convert_columns(&[Arc::clone(l.values())])?;
+ let rows_r = converter.convert_columns(&[Arc::clone(r.values())])?;
+ let l_offsets = l.value_offsets();
+ let r_offsets = r.value_offsets();
+
+ let mut result_offsets = Vec::with_capacity(l.len() + 1);
+ result_offsets.push(OffsetSize::usize_as(0));
+ let mut final_rows = Vec::with_capacity(rows_l.num_rows());
Review Comment:
Updated the initial capacity of `final_rows` for `SetOp::Intersect` to
`min(rows_l.num_rows(), rows_r.num_rows())`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]