This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 7089786632 [Variant] Avoid collecting offset iterator (#7934)
7089786632 is described below

commit 7089786632b7bcec10c16b4b4aad0841a66d883a
Author: Yan Tingwang <tingwangyan2...@163.com>
AuthorDate: Thu Jul 17 21:56:39 2025 +0800

    [Variant] Avoid collecting offset iterator (#7934)
    
    # Which issue does this PR close?
    
    We generally require a GitHub issue to be filed for all bug fixes and
    enhancements and this helps us generate change logs for our releases.
    You can link an issue to this PR using the GitHub syntax.
    
    - Closes #7901 .
    
    # Rationale for this change
    
    Why are you proposing this change? If this is already explained clearly
    in the issue then this section is not needed.
    Explaining clearly why changes are proposed helps reviewers understand
    your changes and offer better suggestions for fixes.
    
    # What changes are included in this PR?
    
    There is no need to duplicate the description in the issue here but it
    is sometimes worth providing a summary of the individual changes in this
    PR.
    
    # Are these changes tested?
    
    We typically require tests for all PRs in order to:
    1. Prevent the code from being accidentally broken by subsequent changes
    2. Serve as another way to document the expected behavior of the code
    
    If tests are not included in your PR, please explain why (for example,
    are they covered by existing tests)?
    
    # Are there any user-facing changes?
    
    If there are user-facing changes then we may require documentation to be
    updated before approving the PR.
    
    If there are any breaking changes to public APIs, please call them out.
    
    ---------
    
    Signed-off-by: codephage2020 <tingwangyan2...@163.com>
---
 parquet-variant/src/variant/metadata.rs | 58 ++++++++++++++++++++-------------
 parquet-variant/src/variant/object.rs   | 58 ++++++++++++++++++++-------------
 2 files changed, 72 insertions(+), 44 deletions(-)

diff --git a/parquet-variant/src/variant/metadata.rs 
b/parquet-variant/src/variant/metadata.rs
index add31465d2..c75f232aa7 100644
--- a/parquet-variant/src/variant/metadata.rs
+++ b/parquet-variant/src/variant/metadata.rs
@@ -234,32 +234,44 @@ impl<'m> VariantMetadata<'m> {
                 self.header.first_offset_byte() as _..self.first_value_byte as 
_,
             )?;
 
-            let offsets =
-                map_bytes_to_offsets(offset_bytes, 
self.header.offset_size).collect::<Vec<_>>();
-
             // Verify the string values in the dictionary are UTF-8 encoded 
strings.
             let value_buffer =
                 string_from_slice(self.bytes, 0, self.first_value_byte as 
_..self.bytes.len())?;
 
+            let mut offsets_iter = map_bytes_to_offsets(offset_bytes, 
self.header.offset_size);
+            let mut current_offset = offsets_iter.next().unwrap_or(0);
+
             if self.header.is_sorted {
                 // Validate the dictionary values are unique and 
lexicographically sorted
                 //
                 // Since we use the offsets to access dictionary values, this 
also validates
                 // offsets are in-bounds and monotonically increasing
-                let are_dictionary_values_unique_and_sorted = 
(1..offsets.len())
-                    .map(|i| {
-                        let field_range = offsets[i - 1]..offsets[i];
-                        value_buffer.get(field_range)
-                    })
-                    .is_sorted_by(|a, b| match (a, b) {
-                        (Some(a), Some(b)) => a < b,
-                        _ => false,
-                    });
-
-                if !are_dictionary_values_unique_and_sorted {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "dictionary values are not unique and 
ordered".to_string(),
-                    ));
+                let mut prev_value: Option<&str> = None;
+
+                for next_offset in offsets_iter {
+                    if next_offset <= current_offset {
+                        return Err(ArrowError::InvalidArgumentError(
+                            "offsets not monotonically increasing".to_string(),
+                        ));
+                    }
+
+                    let current_value =
+                        value_buffer
+                            .get(current_offset..next_offset)
+                            .ok_or_else(|| {
+                                ArrowError::InvalidArgumentError("offset out 
of bounds".to_string())
+                            })?;
+
+                    if let Some(prev_val) = prev_value {
+                        if current_value <= prev_val {
+                            return Err(ArrowError::InvalidArgumentError(
+                                "dictionary values are not unique and 
ordered".to_string(),
+                            ));
+                        }
+                    }
+
+                    prev_value = Some(current_value);
+                    current_offset = next_offset;
                 }
             } else {
                 // Validate offsets are in-bounds and monotonically increasing
@@ -267,11 +279,13 @@ impl<'m> VariantMetadata<'m> {
                 // Since shallow validation ensures the first and last offsets 
are in bounds,
                 // we can also verify all offsets are in-bounds by checking if
                 // offsets are monotonically increasing
-                let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
-                if !are_offsets_monotonic {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "offsets not monotonically increasing".to_string(),
-                    ));
+                for next_offset in offsets_iter {
+                    if next_offset <= current_offset {
+                        return Err(ArrowError::InvalidArgumentError(
+                            "offsets not monotonically increasing".to_string(),
+                        ));
+                    }
+                    current_offset = next_offset;
                 }
             }
 
diff --git a/parquet-variant/src/variant/object.rs 
b/parquet-variant/src/variant/object.rs
index 37ebce818d..50094cb39d 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -217,23 +217,31 @@ impl<'m, 'v> VariantObject<'m, 'v> {
                 self.header.field_ids_start_byte() as 
_..self.first_field_offset_byte as _,
             )?;
 
-            let field_ids = map_bytes_to_offsets(field_id_buffer, 
self.header.field_id_size)
-                .collect::<Vec<_>>();
-
+            let mut field_ids_iter =
+                map_bytes_to_offsets(field_id_buffer, 
self.header.field_id_size);
             // Validate all field ids exist in the metadata dictionary and the 
corresponding field names are lexicographically sorted
             if self.metadata.is_sorted() {
                 // Since the metadata dictionary has unique and sorted field 
names, we can also guarantee this object's field names
                 // are lexicographically sorted by their field id ordering
-                if !field_ids.is_sorted() {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "field names not sorted".to_string(),
-                    ));
-                }
+                let dictionary_size = self.metadata.dictionary_size();
+
+                if let Some(mut current_id) = field_ids_iter.next() {
+                    for next_id in field_ids_iter {
+                        if current_id >= dictionary_size {
+                            return Err(ArrowError::InvalidArgumentError(
+                                "field id is not valid".to_string(),
+                            ));
+                        }
+
+                        if next_id <= current_id {
+                            return Err(ArrowError::InvalidArgumentError(
+                                "field names not sorted".to_string(),
+                            ));
+                        }
+                        current_id = next_id;
+                    }
 
-                // Since field ids are sorted, if the last field is smaller 
than the dictionary size,
-                // we also know all field ids are smaller than the dictionary 
size and in-bounds.
-                if let Some(&last_field_id) = field_ids.last() {
-                    if last_field_id >= self.metadata.dictionary_size() {
+                    if current_id >= dictionary_size {
                         return Err(ArrowError::InvalidArgumentError(
                             "field id is not valid".to_string(),
                         ));
@@ -244,16 +252,22 @@ impl<'m, 'v> VariantObject<'m, 'v> {
                 // to check lexicographical order
                 //
                 // Since we are probing the metadata dictionary by field id, 
this also verifies field ids are in-bounds
-                let are_field_names_sorted = field_ids
-                    .iter()
-                    .map(|&i| self.metadata.get(i))
-                    .collect::<Result<Vec<_>, _>>()?
-                    .is_sorted();
-
-                if !are_field_names_sorted {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "field names not sorted".to_string(),
-                    ));
+                let mut current_field_name = match field_ids_iter.next() {
+                    Some(field_id) => Some(self.metadata.get(field_id)?),
+                    None => None,
+                };
+
+                for field_id in field_ids_iter {
+                    let next_field_name = self.metadata.get(field_id)?;
+
+                    if let Some(current_name) = current_field_name {
+                        if next_field_name <= current_name {
+                            return Err(ArrowError::InvalidArgumentError(
+                                "field names not sorted".to_string(),
+                            ));
+                        }
+                    }
+                    current_field_name = Some(next_field_name);
                 }
             }
 

Reply via email to