This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 8beeab2769 perf: improve calculating length performance for view byte 
array in row conversion (#9080)
8beeab2769 is described below

commit 8beeab276940372211cca08a68cb74458c2ddbd2
Author: Raz Luvaton <[email protected]>
AuthorDate: Tue Jan 13 14:15:52 2026 +0200

    perf: improve calculating length performance for view byte array in row 
conversion (#9080)
    
    # Which issue does this PR close?
    
    N/A
    
    # Rationale for this change
    
    Make the row length calculation faster, which results in faster row
    conversion.
    
    # What changes are included in this PR?
    
    1. Instead of iterating over the values and taking the length of each
    byte slice, we read the length stored in each view directly. This is
    faster because it avoids going to the data buffers
    2. Added new API for `GenericByteViewArray` (explained below)
    
    # Are these changes tested?
    
    Yes
    
    # Are there any user-facing changes?
    
    Yes, added `lengths` function to `GenericByteViewArray` to get an
    iterator over the lengths of the items in the array
    
    -----
    
    Related to:
    - #9078
    - #9079
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-array/src/array/byte_view_array.rs | 135 ++++++++++++++++++++++++++++++-
 arrow-row/src/lib.rs                     |  42 +++++++---
 2 files changed, 165 insertions(+), 12 deletions(-)

diff --git a/arrow-array/src/array/byte_view_array.rs 
b/arrow-array/src/array/byte_view_array.rs
index 54a9a94324..ca8ddfbe2a 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -438,6 +438,26 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         })
     }
 
+    /// Return an iterator over the length of each array element, including 
null values.
+    ///
+    /// For null values, the length stored in the underlying view is returned, NOT 0.
+    ///
+    /// Example of getting 0 for null values
+    /// ```rust
+    /// # use arrow_array::StringViewArray;
+    /// # use arrow_array::Array;
+    /// use arrow_data::ByteView;
+    ///
+    /// fn lengths_with_zero_for_nulls(view: &StringViewArray) -> impl 
Iterator<Item = u32> {
+    ///     view.lengths()
+    ///         .enumerate()
+    ///         .map(|(index, length)| if view.is_null(index) { 0 } else { 
length })
+    /// }
+    /// ```
+    pub fn lengths(&self) -> impl ExactSizeIterator<Item = u32> + Clone {
+        self.views().iter().map(|v| *v as u32)
+    }
+
     /// Returns a zero-copy slice of this array with the indicated offset and 
length.
     pub fn slice(&self, offset: usize, length: usize) -> Self {
         Self {
@@ -1184,7 +1204,7 @@ mod tests {
     use crate::{
         Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, 
StringViewArray,
     };
-    use arrow_buffer::{Buffer, ScalarBuffer};
+    use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
     use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
     use rand::prelude::StdRng;
     use rand::{Rng, SeedableRng};
@@ -1681,4 +1701,117 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn empty_array_should_return_empty_lengths_iterator() {
+        let empty = 
GenericByteViewArray::<BinaryViewType>::from(Vec::<&[u8]>::new());
+
+        let mut lengths_iter = empty.lengths();
+        assert_eq!(lengths_iter.len(), 0);
+        assert_eq!(lengths_iter.next(), None);
+    }
+
+    #[test]
+    fn 
array_lengths_should_return_correct_length_for_both_inlined_and_non_inlined() {
+        let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
+            // Not inlined as longer than 12 bytes
+            b"Supercalifragilisticexpialidocious" as &[u8],
+            // Inlined as shorter than 12 bytes
+            b"Hello",
+            // Empty value
+            b"",
+            // Exactly 12 bytes
+            b"abcdefghijkl",
+        ]);
+
+        let mut lengths_iter = cases.lengths();
+
+        assert_eq!(lengths_iter.len(), cases.len());
+
+        let cases_iter = cases.iter();
+
+        for case in cases_iter {
+            let case_value = case.unwrap();
+            let length = lengths_iter.next().expect("Should have a length");
+
+            assert_eq!(case_value.len(), length as usize);
+        }
+
+        assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
+    }
+
+    #[test]
+    fn array_lengths_should_return_the_underlying_length_for_null_values() {
+        let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
+            // Not inlined as longer than 12 bytes
+            b"Supercalifragilisticexpialidocious" as &[u8],
+            // Inlined as shorter than 12 bytes
+            b"Hello",
+            // Empty value
+            b"",
+            // Exactly 12 bytes
+            b"abcdefghijkl",
+        ]);
+
+        let (views, buffer, _) = cases.clone().into_parts();
+
+        // Keeping the values but just adding nulls on top
+        let cases_with_all_nulls = GenericByteViewArray::<BinaryViewType>::new(
+            views,
+            buffer,
+            Some(NullBuffer::new_null(cases.len())),
+        );
+
+        let lengths_iter = cases.lengths();
+        let mut all_nulls_lengths_iter = cases_with_all_nulls.lengths();
+
+        assert_eq!(lengths_iter.len(), all_nulls_lengths_iter.len());
+
+        for expected_length in lengths_iter {
+            let actual_length = all_nulls_lengths_iter.next().expect("Should 
have a length");
+
+            assert_eq!(expected_length, actual_length);
+        }
+
+        assert_eq!(
+            all_nulls_lengths_iter.next(),
+            None,
+            "Should not have more lengths"
+        );
+    }
+
+    #[test]
+    fn array_lengths_on_sliced_should_only_return_lengths_for_sliced_data() {
+        let array = GenericByteViewArray::<BinaryViewType>::from(vec![
+            b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
+            b"Hello",
+            b"something great",
+            b"is",
+            b"coming soon!",
+            b"when you find what it is",
+            b"let me know",
+            b"cause",
+            b"I",
+            b"have no idea",
+            b"what it",
+            b"is",
+        ]);
+
+        let sliced_array = array.slice(2, array.len() - 3);
+
+        let mut lengths_iter = sliced_array.lengths();
+
+        assert_eq!(lengths_iter.len(), sliced_array.len());
+
+        let values_iter = sliced_array.iter();
+
+        for value in values_iter {
+            let value = value.unwrap();
+            let length = lengths_iter.next().expect("Should have a length");
+
+            assert_eq!(value.len(), length as usize);
+        }
+
+        assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
+    }
 }
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index 2f6ffd76d9..28c65c5994 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -164,7 +164,7 @@ use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
 use arrow_array::cast::*;
-use arrow_array::types::ArrowDictionaryKeyType;
+use arrow_array::types::{ArrowDictionaryKeyType, ByteViewType};
 use arrow_array::*;
 use arrow_buffer::{ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder};
@@ -1555,11 +1555,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) 
-> LengthTracker {
                             .iter()
                             .map(|slice| variable::encoded_len(slice))
                     ),
-                    DataType::BinaryView => tracker.push_variable(
-                        array.as_binary_view()
-                            .iter()
-                            .map(|slice| variable::encoded_len(slice))
-                    ),
+                    DataType::BinaryView => push_byte_view_array_lengths(&mut 
tracker, array.as_binary_view()),
                     DataType::Utf8 => tracker.push_variable(
                         array.as_string::<i32>()
                             .iter()
@@ -1570,11 +1566,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) 
-> LengthTracker {
                             .iter()
                             .map(|slice| variable::encoded_len(slice.map(|x| 
x.as_bytes())))
                     ),
-                    DataType::Utf8View => tracker.push_variable(
-                        array.as_string_view()
-                            .iter()
-                            .map(|slice| variable::encoded_len(slice.map(|x| 
x.as_bytes())))
-                    ),
+                    DataType::Utf8View => push_byte_view_array_lengths(&mut 
tracker, array.as_string_view()),
                     DataType::FixedSizeBinary(len) => {
                         let len = len.to_usize().unwrap();
                         tracker.push_fixed(1 + len)
@@ -1664,6 +1656,34 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) 
-> LengthTracker {
     tracker
 }
 
+/// Add to [`LengthTracker`] the encoded length of each item in the 
[`GenericByteViewArray`]
+fn push_byte_view_array_lengths<T: ByteViewType>(
+    tracker: &mut LengthTracker,
+    array: &GenericByteViewArray<T>,
+) {
+    if let Some(nulls) = array.nulls().filter(|n| n.null_count() > 0) {
+        tracker.push_variable(
+            array
+                .lengths()
+                .zip(nulls.iter())
+                .map(|(length, is_valid)| {
+                    if is_valid {
+                        Some(length as usize)
+                    } else {
+                        None
+                    }
+                })
+                .map(variable::padded_length),
+        )
+    } else {
+        tracker.push_variable(
+            array
+                .lengths()
+                .map(|len| variable::padded_length(Some(len as usize))),
+        )
+    }
+}
+
 /// Encodes a column to the provided [`Rows`] incrementing the offsets as it 
progresses
 fn encode_column(
     data: &mut [u8],

Reply via email to