This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 13d497af2b perf: improve calculating length performance for nested
arrays in row conversion (#9079)
13d497af2b is described below
commit 13d497af2be5d91c1c57775b8a57487ca05babe3
Author: Raz Luvaton <[email protected]>
AuthorDate: Wed Jan 14 14:39:16 2026 +0200
perf: improve calculating length performance for nested arrays in row
conversion (#9079)
# Which issue does this PR close?
N/A
# Rationale for this change
Making the row length calculation faster which result in faster row
conversion
# What changes are included in this PR?
1. Instead of iterating over the rows and getting the length from the
byte slice, we use the offsets directly, this
2. Added 3 new APIs for `Rows` (explained below)
# Are these changes tested?
Yes
# Are there any user-facing changes?
Yes, added 3 functions to `Rows`:
- `row_len` - get the row length at index
- `row_len_unchecked` - get the row length at index without bound checks
- `lengths` - get iterator over the lengths of the rows
-----
Related to:
- #9078
- #9080
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-row/src/lib.rs | 67 ++++++++++++++++++++++++++++++++++++++++++++++++---
arrow-row/src/list.rs | 34 +++++++++++++-------------
arrow-row/src/run.rs | 4 +--
3 files changed, 82 insertions(+), 23 deletions(-)
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index 361b554414..1da0439ee9 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -161,6 +161,8 @@
#![warn(missing_docs)]
use std::cmp::Ordering;
use std::hash::{Hash, Hasher};
+use std::iter::Map;
+use std::slice::Windows;
use std::sync::Arc;
use arrow_array::cast::*;
@@ -1118,6 +1120,9 @@ pub struct Rows {
config: RowConfig,
}
+/// The iterator type for [`Rows::lengths`]
+pub type RowLengthIter<'a> = Map<Windows<'a, usize>, fn(&'a [usize]) -> usize>;
+
impl Rows {
/// Append a [`Row`] to this [`Rows`]
pub fn push(&mut self, row: Row<'_>) {
@@ -1156,6 +1161,19 @@ impl Rows {
}
}
+ /// Returns the number of bytes the row at index `row` is occupying,
+ /// that is, what is the length of the returned [`Row::data`] will be.
+ pub fn row_len(&self, row: usize) -> usize {
+ assert!(row + 1 < self.offsets.len());
+
+ self.offsets[row + 1] - self.offsets[row]
+ }
+
+ /// Get an iterator over the lengths of each row in this [`Rows`]
+ pub fn lengths(&self) -> RowLengthIter<'_> {
+ self.offsets.windows(2).map(|w| w[1] - w[0])
+ }
+
/// Sets the length of this [`Rows`] to 0
pub fn clear(&mut self) {
self.offsets.truncate(1);
@@ -1563,7 +1581,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder])
-> LengthTracker {
array => {
tracker.push_variable(
array.keys().iter().map(|v| match v {
- Some(k) => values.row(k.as_usize()).data.len(),
+ Some(k) => values.row_len(k.as_usize()),
None => null.data.len(),
})
)
@@ -1574,7 +1592,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder])
-> LengthTracker {
Encoder::Struct(rows, null) => {
let array = as_struct_array(array);
tracker.push_variable((0..array.len()).map(|idx| match
array.is_valid(idx) {
- true => 1 + rows.row(idx).as_ref().len(),
+ true => 1 + rows.row_len(idx),
false => 1 + null.data.len(),
}));
}
@@ -1626,10 +1644,10 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder])
-> LengthTracker {
let lengths = (0..union_array.len()).map(|i| {
let type_id = type_ids[i];
let child_row_i = offsets.as_ref().map(|o| o[i] as
usize).unwrap_or(i);
- let child_row = child_rows[type_id as
usize].row(child_row_i);
+ let child_row_len = child_rows[type_id as
usize].row_len(child_row_i);
// length: 1 byte type_id + child row bytes
- 1 + child_row.as_ref().len()
+ 1 + child_row_len
});
tracker.push_variable(lengths);
@@ -3699,6 +3717,38 @@ mod tests {
}
}
+ // Validate rows length iterator
+ {
+ let mut rows_iter = rows.iter();
+ let mut rows_lengths_iter = rows.lengths();
+ for (index, row) in rows_iter.by_ref().enumerate() {
+ let len = rows_lengths_iter
+ .next()
+ .expect("Reached end of length iterator while still
have rows");
+ assert_eq!(
+ row.data.len(),
+ len,
+ "Row length mismatch: {} vs {}",
+ row.data.len(),
+ len
+ );
+ assert_eq!(
+ len,
+ rows.row_len(index),
+ "Row length mismatch at index {}: {} vs {}",
+ index,
+ len,
+ rows.row_len(index)
+ );
+ }
+
+ assert_eq!(
+ rows_lengths_iter.next(),
+ None,
+ "Length iterator did not reach end"
+ );
+ }
+
// Convert rows produced from convert_columns().
// Note: validate_utf8 is set to false since Row is initialized
through empty_rows()
let back = converter.convert_rows(&rows).unwrap();
@@ -4351,4 +4401,13 @@ mod tests {
"Size should increase when reserving more space than previously
reserved"
);
}
+
+ #[test]
+ fn empty_rows_should_return_empty_lengths_iterator() {
+ let rows = RowConverter::new(vec![SortField::new(DataType::UInt8)])
+ .unwrap()
+ .empty_rows(0, 0);
+ let mut lengths_iter = rows.lengths();
+ assert_eq!(lengths_iter.next(), None);
+ }
}
diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs
index e04aa70c52..6e552b0a93 100644
--- a/arrow-row/src/list.rs
+++ b/arrow-row/src/list.rs
@@ -27,32 +27,32 @@ pub fn compute_lengths<O: OffsetSizeTrait>(
rows: &Rows,
array: &GenericListArray<O>,
) {
- let shift = array.value_offsets()[0].as_usize();
-
let offsets = array.value_offsets().windows(2);
+ let mut rows_length_iter = rows.lengths();
+
lengths
.iter_mut()
.zip(offsets)
.enumerate()
.for_each(|(idx, (length, offsets))| {
- let start = offsets[0].as_usize() - shift;
- let end = offsets[1].as_usize() - shift;
- let range = array.is_valid(idx).then_some(start..end);
- *length += encoded_len(rows, range);
+ let len = offsets[1].as_usize() - offsets[0].as_usize();
+ if array.is_valid(idx) {
+ *length += 1 + rows_length_iter
+ .by_ref()
+ .take(len)
+ .map(Some)
+ .map(super::variable::padded_length)
+ .sum::<usize>()
+ } else {
+ // Advance rows iterator by len
+ if len > 0 {
+ rows_length_iter.nth(len - 1);
+ }
+ *length += 1;
+ }
});
}
-fn encoded_len(rows: &Rows, range: Option<Range<usize>>) -> usize {
- match range {
- None => 1,
- Some(range) => {
- 1 + range
- .map(|i|
super::variable::padded_length(Some(rows.row(i).as_ref().len())))
- .sum::<usize>()
- }
- }
-}
-
/// Encodes the provided `GenericListArray` to `out` with the provided
`SortOptions`
///
/// `rows` should contain the encoded child elements
diff --git a/arrow-row/src/run.rs b/arrow-row/src/run.rs
index 24eaaa18e0..e12fa87dce 100644
--- a/arrow-row/src/run.rs
+++ b/arrow-row/src/run.rs
@@ -33,8 +33,8 @@ pub fn compute_lengths<R: RunEndIndexType>(
// Iterate over each run and apply the same length to all logical
positions in the run
for (physical_idx, &run_end) in run_ends.iter().enumerate() {
let logical_end = run_end.as_usize();
- let row = rows.row(physical_idx);
- let encoded_len = variable::encoded_len(Some(row.data));
+ let row_len = rows.row_len(physical_idx);
+ let encoded_len = variable::padded_length(Some(row_len));
// Add the same length for all logical positions in this run
for length in &mut lengths[logical_start..logical_end] {