neilconway commented on code in PR #20620:
URL: https://github.com/apache/datafusion/pull/20620#discussion_r2872955838
##########
datafusion/functions-nested/src/concat.rs:
##########
@@ -396,58 +396,55 @@ fn concat_internal<O: OffsetSizeTrait>(args: &[ArrayRef])
-> Result<ArrayRef> {
.iter()
.map(|arg| as_generic_list_array::<O>(arg))
.collect::<Result<Vec<_>>>()?;
- // Assume number of rows is the same for all arrays
let row_count = list_arrays[0].len();
- let mut array_lengths = vec![];
- let mut arrays = vec![];
+ // Extract underlying values ArrayData from each list array for
MutableArrayData.
+ let values_data: Vec<ArrayData> =
+ list_arrays.iter().map(|la| la.values().to_data()).collect();
+ let values_data_refs: Vec<&ArrayData> = values_data.iter().collect();
+
+ // Estimate capacity as the sum of all values arrays' lengths.
+ let total_capacity: usize = values_data.iter().map(|d| d.len()).sum();
+
+ let mut mutable = MutableArrayData::with_capacities(
+ values_data_refs,
+ false,
+ Capacities::Array(total_capacity),
+ );
+ let mut offsets: Vec<O> = Vec::with_capacity(row_count + 1);
+ offsets.push(O::zero());
let mut valid = NullBufferBuilder::new(row_count);
- for i in 0..row_count {
- let nulls = list_arrays
- .iter()
- .map(|arr| arr.is_null(i))
- .collect::<Vec<_>>();
- // If all the arrays are null, the concatenated array is null
- let is_null = nulls.iter().all(|&x| x);
- if is_null {
- array_lengths.push(0);
- valid.append_null();
- } else {
- // Get all the arrays on i-th row
- let values = list_arrays
- .iter()
- .map(|arr| arr.value(i))
- .collect::<Vec<_>>();
-
- let elements = values
- .iter()
- .map(|a| a.as_ref())
- .collect::<Vec<&dyn Array>>();
-
- // Concatenated array on i-th row
- let concatenated_array =
arrow::compute::concat(elements.as_slice())?;
- array_lengths.push(concatenated_array.len());
- arrays.push(concatenated_array);
+ for row_idx in 0..row_count {
+ let mut row_has_values = false;
Review Comment:
Good idea! (It's a bit trickier because the NULL semantics of `array_concat`
mean we can't use `NullBuffer::union` -- an output row is NULL if ALL of the
concat inputs for that row are NULL. But we can do it by taking the bitwise OR
of the per-row null bitmaps.)
This is 12-22% faster than the previous approach.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]