neilconway commented on code in PR #20617:
URL: https://github.com/apache/datafusion/pull/20617#discussion_r2869049316


##########
datafusion/functions-nested/src/resize.rs:
##########
@@ -206,7 +206,103 @@ fn general_list_resize<O: OffsetSizeTrait + TryInto<i64>>(
     let values = array.values();
     let original_data = values.to_data();
 
-    // create default element array
+    // Track the largest per-row growth so the uniform-fill fast path can
+    // materialize one reusable fill buffer of the required size.
+    let mut max_extra: usize = 0;
+    for (row_index, offset_window) in array.offsets().windows(2).enumerate() {
+        if array.is_null(row_index) {
+            continue;
+        }
+        let target_count = 
count_array.value(row_index).to_usize().ok_or_else(|| {
+            internal_datafusion_err!("array_resize: failed to convert size to 
usize")
+        })?;
+        let current_len = (offset_window[1] - 
offset_window[0]).to_usize().unwrap();
+        if target_count > current_len {
+            let extra = target_count - current_len;
+            if extra > max_extra {
+                max_extra = extra;
+            }
+        }
+    }
+
+    // The fast path is valid when at least one row grows and every row would
+    // use the same fill value.
+    let is_uniform_fill = max_extra > 0
+        && match &default_element {
+            None => true,
+            Some(fill_array) => {
+                let len = fill_array.len();
+                let null_count = fill_array.logical_null_count();
+
+                len <= 1
+                    || null_count == len
+                    || (null_count == 0 && {
+                        let first = fill_array.slice(0, 1);
+                        (1..len)
+                            .all(|i| fill_array.slice(i, 1).as_ref() == 
first.as_ref())
+                    })
+            }
+        };
+
+    // Fast path: at least one row needs to grow and all rows share
+    // the same fill value.
+    if is_uniform_fill {

Review Comment:
   The naming is a bit confusing: `is_uniform_fill` is false if the fill value 
is uniform but there are no rows that need to grow. How about `use_batch_fill` 
or `use_bulk_fill` or similar?



##########
datafusion/functions-nested/src/resize.rs:
##########
@@ -206,7 +206,103 @@ fn general_list_resize<O: OffsetSizeTrait + TryInto<i64>>(
     let values = array.values();
     let original_data = values.to_data();
 
-    // create default element array
+    // Track the largest per-row growth so the uniform-fill fast path can
+    // materialize one reusable fill buffer of the required size.
+    let mut max_extra: usize = 0;
+    for (row_index, offset_window) in array.offsets().windows(2).enumerate() {
+        if array.is_null(row_index) {
+            continue;
+        }
+        let target_count = 
count_array.value(row_index).to_usize().ok_or_else(|| {
+            internal_datafusion_err!("array_resize: failed to convert size to 
usize")
+        })?;
+        let current_len = (offset_window[1] - 
offset_window[0]).to_usize().unwrap();
+        if target_count > current_len {
+            let extra = target_count - current_len;
+            if extra > max_extra {
+                max_extra = extra;
+            }

Review Comment:
   ```suggestion
               max_extra = max_extra.max(extra);
   ```



##########
datafusion/functions-nested/src/resize.rs:
##########
@@ -206,7 +206,103 @@ fn general_list_resize<O: OffsetSizeTrait + TryInto<i64>>(
     let values = array.values();
     let original_data = values.to_data();
 
-    // create default element array
+    // Track the largest per-row growth so the uniform-fill fast path can
+    // materialize one reusable fill buffer of the required size.
+    let mut max_extra: usize = 0;
+    for (row_index, offset_window) in array.offsets().windows(2).enumerate() {
+        if array.is_null(row_index) {
+            continue;
+        }
+        let target_count = 
count_array.value(row_index).to_usize().ok_or_else(|| {
+            internal_datafusion_err!("array_resize: failed to convert size to 
usize")
+        })?;
+        let current_len = (offset_window[1] - 
offset_window[0]).to_usize().unwrap();
+        if target_count > current_len {
+            let extra = target_count - current_len;
+            if extra > max_extra {
+                max_extra = extra;
+            }
+        }
+    }
+
+    // The fast path is valid when at least one row grows and every row would
+    // use the same fill value.
+    let is_uniform_fill = max_extra > 0
+        && match &default_element {
+            None => true,
+            Some(fill_array) => {
+                let len = fill_array.len();
+                let null_count = fill_array.logical_null_count();
+
+                len <= 1
+                    || null_count == len
+                    || (null_count == 0 && {
+                        let first = fill_array.slice(0, 1);
+                        (1..len)
+                            .all(|i| fill_array.slice(i, 1).as_ref() == 
first.as_ref())
+                    })
+            }
+        };
+
+    // Fast path: at least one row needs to grow and all rows share
+    // the same fill value.
+    if is_uniform_fill {
+        let fill_scalar = match &default_element {
+            None => ScalarValue::try_from(&data_type)?,
+            Some(fill_array) if fill_array.logical_null_count() == 
fill_array.len() => {
+                ScalarValue::try_from(&data_type)?
+            }
+            Some(fill_array) => 
ScalarValue::try_from_array(fill_array.as_ref(), 0)?,
+        };
+        let default_element = fill_scalar.to_array_of_size(max_extra)?;
+        let default_value_data = default_element.to_data();
+
+        let capacity = Capacities::Array(original_data.len() + 
default_value_data.len());

Review Comment:
   Is this right? `default_value_data.len()` is the per-row growth, I think we 
want the total estimated output size.



##########
datafusion/functions-nested/src/resize.rs:
##########
@@ -206,7 +206,103 @@ fn general_list_resize<O: OffsetSizeTrait + TryInto<i64>>(
     let values = array.values();
     let original_data = values.to_data();
 
-    // create default element array
+    // Track the largest per-row growth so the uniform-fill fast path can
+    // materialize one reusable fill buffer of the required size.
+    let mut max_extra: usize = 0;
+    for (row_index, offset_window) in array.offsets().windows(2).enumerate() {
+        if array.is_null(row_index) {
+            continue;
+        }
+        let target_count = 
count_array.value(row_index).to_usize().ok_or_else(|| {
+            internal_datafusion_err!("array_resize: failed to convert size to 
usize")
+        })?;
+        let current_len = (offset_window[1] - 
offset_window[0]).to_usize().unwrap();
+        if target_count > current_len {
+            let extra = target_count - current_len;
+            if extra > max_extra {
+                max_extra = extra;
+            }
+        }
+    }
+
+    // The fast path is valid when at least one row grows and every row would
+    // use the same fill value.
+    let is_uniform_fill = max_extra > 0
+        && match &default_element {
+            None => true,
+            Some(fill_array) => {
+                let len = fill_array.len();
+                let null_count = fill_array.logical_null_count();
+
+                len <= 1
+                    || null_count == len
+                    || (null_count == 0 && {
+                        let first = fill_array.slice(0, 1);
+                        (1..len)
+                            .all(|i| fill_array.slice(i, 1).as_ref() == 
first.as_ref())
+                    })
+            }
+        };
+
+    // Fast path: at least one row needs to grow and all rows share
+    // the same fill value.
+    if is_uniform_fill {
+        let fill_scalar = match &default_element {
+            None => ScalarValue::try_from(&data_type)?,
+            Some(fill_array) if fill_array.logical_null_count() == 
fill_array.len() => {
+                ScalarValue::try_from(&data_type)?
+            }
+            Some(fill_array) => 
ScalarValue::try_from_array(fill_array.as_ref(), 0)?,
+        };
+        let default_element = fill_scalar.to_array_of_size(max_extra)?;
+        let default_value_data = default_element.to_data();
+
+        let capacity = Capacities::Array(original_data.len() + 
default_value_data.len());
+        let mut offsets = vec![O::usize_as(0)];
+        let mut mutable = MutableArrayData::with_capacities(
+            vec![&original_data, &default_value_data],
+            false,
+            capacity,
+        );
+
+        let mut null_builder = NullBufferBuilder::new(array.len());
+
+        for (row_index, offset_window) in 
array.offsets().windows(2).enumerate() {

Review Comment:
   The fast path and slow path are doing very similar work and there's a bunch 
of duplicate code. I wonder if it would be possible to refactor them -- the key 
difference is just how the fill itself is done, no?



##########
datafusion/functions-nested/src/resize.rs:
##########
@@ -206,7 +206,103 @@ fn general_list_resize<O: OffsetSizeTrait + TryInto<i64>>(
     let values = array.values();
     let original_data = values.to_data();
 
-    // create default element array
+    // Track the largest per-row growth so the uniform-fill fast path can
+    // materialize one reusable fill buffer of the required size.
+    let mut max_extra: usize = 0;
+    for (row_index, offset_window) in array.offsets().windows(2).enumerate() {
+        if array.is_null(row_index) {
+            continue;
+        }
+        let target_count = 
count_array.value(row_index).to_usize().ok_or_else(|| {
+            internal_datafusion_err!("array_resize: failed to convert size to 
usize")
+        })?;
+        let current_len = (offset_window[1] - 
offset_window[0]).to_usize().unwrap();
+        if target_count > current_len {
+            let extra = target_count - current_len;
+            if extra > max_extra {
+                max_extra = extra;
+            }
+        }
+    }
+
+    // The fast path is valid when at least one row grows and every row would
+    // use the same fill value.
+    let is_uniform_fill = max_extra > 0
+        && match &default_element {
+            None => true,
+            Some(fill_array) => {
+                let len = fill_array.len();
+                let null_count = fill_array.logical_null_count();
+
+                len <= 1
+                    || null_count == len
+                    || (null_count == 0 && {
+                        let first = fill_array.slice(0, 1);
+                        (1..len)
+                            .all(|i| fill_array.slice(i, 1).as_ref() == 
first.as_ref())
+                    })
+            }
+        };
+
+    // Fast path: at least one row needs to grow and all rows share
+    // the same fill value.
+    if is_uniform_fill {
+        let fill_scalar = match &default_element {
+            None => ScalarValue::try_from(&data_type)?,
+            Some(fill_array) if fill_array.logical_null_count() == 
fill_array.len() => {
+                ScalarValue::try_from(&data_type)?
+            }
+            Some(fill_array) => 
ScalarValue::try_from_array(fill_array.as_ref(), 0)?,
+        };
+        let default_element = fill_scalar.to_array_of_size(max_extra)?;
+        let default_value_data = default_element.to_data();
+
+        let capacity = Capacities::Array(original_data.len() + 
default_value_data.len());
+        let mut offsets = vec![O::usize_as(0)];
+        let mut mutable = MutableArrayData::with_capacities(
+            vec![&original_data, &default_value_data],
+            false,
+            capacity,
+        );
+
+        let mut null_builder = NullBufferBuilder::new(array.len());
+
+        for (row_index, offset_window) in 
array.offsets().windows(2).enumerate() {
+            if array.is_null(row_index) {
+                null_builder.append_null();
+                offsets.push(offsets[row_index]);
+                continue;
+            }
+            null_builder.append_non_null();
+
+            let count = count_array.value(row_index).to_usize().ok_or_else(|| {
+                internal_datafusion_err!("array_resize: failed to convert size 
to usize")
+            })?;
+            let count = O::usize_as(count);
+            let start = offset_window[0];
+            if start + count > offset_window[1] {
+                let extra_count = (start + count - 
offset_window[1]).to_usize().unwrap();
+                let end = offset_window[1];
+                mutable.extend(0, (start).to_usize().unwrap(), 
(end).to_usize().unwrap());

Review Comment:
   (Minor) Why `(start)`? Parens unnecessary, here and below.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to