alamb commented on code in PR #7666:
URL: https://github.com/apache/arrow-rs/pull/7666#discussion_r2150508053


##########
parquet-variant/src/variant.rs:
##########
@@ -300,62 +303,218 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
-    pub metadata: &'m VariantMetadata<'m>,
-    pub value: &'v [u8],
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObjectHeader {

Review Comment:
   I think this could be `pub(crate)` rather than `pub`



##########
parquet-variant/src/variant.rs:
##########
@@ -717,4 +980,257 @@ mod tests {
             "unexpected error: {err:?}"
         );
     }
+

Review Comment:
   Some other things to test here would be:
   1. objects that have "is_large" set (aka have more than 256 distinct field 
names)



##########
parquet-variant/src/variant.rs:
##########
@@ -300,62 +303,218 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
-    pub metadata: &'m VariantMetadata<'m>,
-    pub value: &'v [u8],
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObjectHeader {
+    field_offset_size: OffsetSizeBytes,
+    field_id_size: OffsetSizeBytes,
+    num_elements: usize,
+    field_ids_start_byte: usize,
+    field_offsets_start_byte: usize,
+    values_start_byte: usize,
 }
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
-        todo!();
-        #[allow(unreachable_code)] // Just to infer the return type
-        Ok(vec![].into_iter())
+
+impl VariantObjectHeader {
+    pub fn try_new(value: &[u8]) -> Result<Self, ArrowError> {
+        // Parse the header byte to get object parameters
+        let header = first_byte_from_slice(value)?;
+        let value_header = header >> 2;
+
+        let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
+        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
+        let is_large = value_header & 0x10; // 5th bit
+
+        let field_offset_size = 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?;
+        let field_id_size = OffsetSizeBytes::try_new(field_id_size_minus_one)?;
+
+        // Determine num_elements size based on is_large flag
+        let num_elements_size = if is_large != 0 {
+            OffsetSizeBytes::Four
+        } else {
+            OffsetSizeBytes::One
+        };
+
+        // Parse num_elements
+        let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
+
+        // Calculate byte offsets for different sections
+        let field_ids_start_byte = 1 + num_elements_size as usize;
+        let field_offsets_start_byte = field_ids_start_byte + num_elements * 
field_id_size as usize;
+        let values_start_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+
+        // Verify that the last field offset array entry is inside the value 
slice
+        let last_field_offset_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+        if last_field_offset_byte > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last field offset array entry at offset {} with length {} is 
outside the value slice of length {}",
+                last_field_offset_byte,
+                field_offset_size as usize,
+                value.len()
+            )));
+        }
+
+        // Verify that the value of the last field offset array entry fits 
inside the value slice
+        let last_field_offset =
+            field_offset_size.unpack_usize(value, field_offsets_start_byte, 
num_elements)?;
+        if values_start_byte + last_field_offset > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last field offset value {} at offset {} is outside the value 
slice of length {}",
+                last_field_offset,
+                values_start_byte,
+                value.len()
+            )));
+        }
+        Ok(Self {
+            field_offset_size,
+            field_id_size,
+            num_elements,
+            field_ids_start_byte,
+            field_offsets_start_byte,
+            values_start_byte,
+        })
     }
-    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
-        todo!()
+
+    /// Returns the number of key-value pairs in this object
+    pub fn num_elements(&self) -> usize {
+        self.num_elements
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantArray<'m, 'v> {
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
     pub metadata: &'m VariantMetadata<'m>,
     pub value: &'v [u8],

Review Comment:
   It wasn't added in this PR but I think we should probably make these fields 
non public (we can add accessors or something) so we can 
   1. Hint people use the nicer APIs
   2. Potentially change the implementation if needed
   
   We can do this in some other PR too, I just wanted to point this out



##########
parquet-variant/src/variant.rs:
##########
@@ -300,62 +303,218 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
-    pub metadata: &'m VariantMetadata<'m>,
-    pub value: &'v [u8],
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObjectHeader {
+    field_offset_size: OffsetSizeBytes,
+    field_id_size: OffsetSizeBytes,
+    num_elements: usize,
+    field_ids_start_byte: usize,
+    field_offsets_start_byte: usize,
+    values_start_byte: usize,
 }
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
-        todo!();
-        #[allow(unreachable_code)] // Just to infer the return type
-        Ok(vec![].into_iter())
+
+impl VariantObjectHeader {
+    pub fn try_new(value: &[u8]) -> Result<Self, ArrowError> {
+        // Parse the header byte to get object parameters
+        let header = first_byte_from_slice(value)?;
+        let value_header = header >> 2;
+
+        let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
+        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
+        let is_large = value_header & 0x10; // 5th bit
+
+        let field_offset_size = 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?;
+        let field_id_size = OffsetSizeBytes::try_new(field_id_size_minus_one)?;
+
+        // Determine num_elements size based on is_large flag
+        let num_elements_size = if is_large != 0 {
+            OffsetSizeBytes::Four
+        } else {
+            OffsetSizeBytes::One
+        };
+
+        // Parse num_elements
+        let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
+
+        // Calculate byte offsets for different sections
+        let field_ids_start_byte = 1 + num_elements_size as usize;
+        let field_offsets_start_byte = field_ids_start_byte + num_elements * 
field_id_size as usize;
+        let values_start_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+
+        // Verify that the last field offset array entry is inside the value 
slice
+        let last_field_offset_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+        if last_field_offset_byte > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last field offset array entry at offset {} with length {} is 
outside the value slice of length {}",
+                last_field_offset_byte,
+                field_offset_size as usize,
+                value.len()
+            )));
+        }
+
+        // Verify that the value of the last field offset array entry fits 
inside the value slice
+        let last_field_offset =
+            field_offset_size.unpack_usize(value, field_offsets_start_byte, 
num_elements)?;
+        if values_start_byte + last_field_offset > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last field offset value {} at offset {} is outside the value 
slice of length {}",
+                last_field_offset,
+                values_start_byte,
+                value.len()
+            )));
+        }
+        Ok(Self {
+            field_offset_size,
+            field_id_size,
+            num_elements,
+            field_ids_start_byte,
+            field_offsets_start_byte,
+            values_start_byte,
+        })
     }
-    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
-        todo!()
+
+    /// Returns the number of key-value pairs in this object
+    pub fn num_elements(&self) -> usize {
+        self.num_elements
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantArray<'m, 'v> {
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
     pub metadata: &'m VariantMetadata<'m>,
     pub value: &'v [u8],
+    header: VariantObjectHeader,
 }
 
-impl<'m, 'v> VariantArray<'m, 'v> {
-    /// Return the length of this array
+impl<'m, 'v> VariantObject<'m, 'v> {
+    pub fn try_new(metadata: &'m VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
+        Ok(Self {
+            metadata,
+            value,
+            header: VariantObjectHeader::try_new(value)?,
+        })
+    }
+
+    /// Returns the number of key-value pairs in this object
     pub fn len(&self) -> usize {
-        todo!()
+        self.header.num_elements()
     }
 
-    /// Is the array of zero length
+    /// Returns true if the object contains no key-value pairs
     pub fn is_empty(&self) -> bool {
         self.len() == 0
     }
 
-    pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, 
ArrowError> {
-        todo!();
-        #[allow(unreachable_code)] // Just to infer the return type
-        Ok(vec![].into_iter())
+    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
+        let field_list = self.parse_field_list()?;
+        Ok(field_list.into_iter())
     }
 
-    pub fn get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+    pub fn field(&self, name: &str) -> Result<Option<Variant<'m, 'v>>, 
ArrowError> {
+        // Binary search through the sorted field IDs to find the field
+        let (field_ids, field_offsets) = self.parse_field_arrays()?;

Review Comment:
   We can probably optimize this code in a future PR -- I don't think we need 
to create a whole Vec<..> just to search
   
   Maybe we can implement a `OffsetSize::try_binary_search_by` type method that 
directly computes the offsets during the search.
   
   Similarly, we could also add a `OffsetSize::try_linear_search_by` method 
that directly does the linear search when the dictionary is not sorted 



##########
parquet-variant/src/variant.rs:
##########
@@ -300,62 +303,218 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
-    pub metadata: &'m VariantMetadata<'m>,
-    pub value: &'v [u8],
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObjectHeader {
+    field_offset_size: OffsetSizeBytes,
+    field_id_size: OffsetSizeBytes,
+    num_elements: usize,
+    field_ids_start_byte: usize,
+    field_offsets_start_byte: usize,
+    values_start_byte: usize,
 }
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
-        todo!();
-        #[allow(unreachable_code)] // Just to infer the return type
-        Ok(vec![].into_iter())
+
+impl VariantObjectHeader {
+    pub fn try_new(value: &[u8]) -> Result<Self, ArrowError> {
+        // Parse the header byte to get object parameters
+        let header = first_byte_from_slice(value)?;
+        let value_header = header >> 2;
+
+        let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
+        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
+        let is_large = value_header & 0x10; // 5th bit
+
+        let field_offset_size = 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?;
+        let field_id_size = OffsetSizeBytes::try_new(field_id_size_minus_one)?;
+
+        // Determine num_elements size based on is_large flag
+        let num_elements_size = if is_large != 0 {
+            OffsetSizeBytes::Four
+        } else {
+            OffsetSizeBytes::One
+        };
+
+        // Parse num_elements
+        let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
+
+        // Calculate byte offsets for different sections
+        let field_ids_start_byte = 1 + num_elements_size as usize;
+        let field_offsets_start_byte = field_ids_start_byte + num_elements * 
field_id_size as usize;
+        let values_start_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+
+        // Verify that the last field offset array entry is inside the value 
slice
+        let last_field_offset_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+        if last_field_offset_byte > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last field offset array entry at offset {} with length {} is 
outside the value slice of length {}",
+                last_field_offset_byte,
+                field_offset_size as usize,
+                value.len()
+            )));
+        }
+
+        // Verify that the value of the last field offset array entry fits 
inside the value slice
+        let last_field_offset =
+            field_offset_size.unpack_usize(value, field_offsets_start_byte, 
num_elements)?;
+        if values_start_byte + last_field_offset > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last field offset value {} at offset {} is outside the value 
slice of length {}",
+                last_field_offset,
+                values_start_byte,
+                value.len()
+            )));
+        }
+        Ok(Self {
+            field_offset_size,
+            field_id_size,
+            num_elements,
+            field_ids_start_byte,
+            field_offsets_start_byte,
+            values_start_byte,
+        })
     }
-    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
-        todo!()
+
+    /// Returns the number of key-value pairs in this object
+    pub fn num_elements(&self) -> usize {
+        self.num_elements
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantArray<'m, 'v> {
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
     pub metadata: &'m VariantMetadata<'m>,
     pub value: &'v [u8],
+    header: VariantObjectHeader,
 }
 
-impl<'m, 'v> VariantArray<'m, 'v> {
-    /// Return the length of this array
+impl<'m, 'v> VariantObject<'m, 'v> {
+    pub fn try_new(metadata: &'m VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
+        Ok(Self {
+            metadata,
+            value,
+            header: VariantObjectHeader::try_new(value)?,
+        })
+    }
+
+    /// Returns the number of key-value pairs in this object
     pub fn len(&self) -> usize {
-        todo!()
+        self.header.num_elements()
     }
 
-    /// Is the array of zero length
+    /// Returns true if the object contains no key-value pairs
     pub fn is_empty(&self) -> bool {
         self.len() == 0
     }
 
-    pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, 
ArrowError> {
-        todo!();
-        #[allow(unreachable_code)] // Just to infer the return type
-        Ok(vec![].into_iter())
+    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
+        let field_list = self.parse_field_list()?;
+        Ok(field_list.into_iter())
     }
 
-    pub fn get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+    pub fn field(&self, name: &str) -> Result<Option<Variant<'m, 'v>>, 
ArrowError> {
+        // Binary search through the sorted field IDs to find the field
+        let (field_ids, field_offsets) = self.parse_field_arrays()?;
+        let search_result = try_binary_search_by(&field_ids, &name, 
|&field_id| {

Review Comment:
   I think it is only correct to use binary search for the field name if the 
metadata has the fields sorted. 
   
   Perhaps for now we can just update this PR to return a NotYetImplemented 
error if the dictionary is not sorted. 
   
   ```suggestion
           if !self.metadata.is_sorted() {
               return Err(ArrowError::NotYetImplemented(
                   "Cannot search for fields in an unsorted 
VariantObject".to_string(),
               ));
           }
           let search_result = try_binary_search_by(&field_ids, &name, 
|&field_id| {



##########
parquet-variant/src/variant.rs:
##########
@@ -300,62 +303,218 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
-    pub metadata: &'m VariantMetadata<'m>,
-    pub value: &'v [u8],
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObjectHeader {
+    field_offset_size: OffsetSizeBytes,
+    field_id_size: OffsetSizeBytes,
+    num_elements: usize,
+    field_ids_start_byte: usize,
+    field_offsets_start_byte: usize,
+    values_start_byte: usize,
 }
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
-        todo!();
-        #[allow(unreachable_code)] // Just to infer the return type
-        Ok(vec![].into_iter())
+
+impl VariantObjectHeader {
+    pub fn try_new(value: &[u8]) -> Result<Self, ArrowError> {
+        // Parse the header byte to get object parameters
+        let header = first_byte_from_slice(value)?;
+        let value_header = header >> 2;
+
+        let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
+        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
+        let is_large = value_header & 0x10; // 5th bit
+
+        let field_offset_size = 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?;
+        let field_id_size = OffsetSizeBytes::try_new(field_id_size_minus_one)?;
+
+        // Determine num_elements size based on is_large flag
+        let num_elements_size = if is_large != 0 {
+            OffsetSizeBytes::Four
+        } else {
+            OffsetSizeBytes::One
+        };
+
+        // Parse num_elements
+        let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
+
+        // Calculate byte offsets for different sections
+        let field_ids_start_byte = 1 + num_elements_size as usize;
+        let field_offsets_start_byte = field_ids_start_byte + num_elements * 
field_id_size as usize;
+        let values_start_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+
+        // Verify that the last field offset array entry is inside the value 
slice
+        let last_field_offset_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+        if last_field_offset_byte > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(

Review Comment:
   it would be great eventually to cover these error cases with tests too (aka 
verify invalid inputs). I don't think it is needed for this PR



##########
parquet-variant/src/variant.rs:
##########
@@ -374,30 +533,134 @@ impl<'m, 'v> VariantArray<'m, 'v> {
             .checked_add(value_bytes)
             .ok_or_else(overflow)?;
 
+        // Verify that the last offset array entry is inside the value slice
+        let last_offset_byte = first_offset_byte + n_offsets * offset_size as 
usize;
+        if last_offset_byte > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last offset array entry at offset {} with length {} is 
outside the value slice of length {}",
+                last_offset_byte,
+                offset_size as usize,
+                value.len()
+            )));
+        }
+
+        // Verify that the value of the last offset array entry fits inside 
the value slice
+        let last_offset = offset_size.unpack_usize(value, first_offset_byte, 
num_elements)?;
+        if first_value_byte + last_offset > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last offset value {} at offset {} is outside the value slice 
of length {}",
+                last_offset,
+                first_value_byte,
+                value.len()
+            )));
+        }
+
+        Ok(Self {
+            offset_size,
+            is_large,
+            num_elements,
+            first_offset_byte,
+            first_value_byte,
+        })
+    }
+
+    /// Returns the number of elements in this list
+    pub fn num_elements(&self) -> usize {
+        self.num_elements
+    }
+
+    /// Returns the offset size in bytes
+    pub fn offset_size(&self) -> usize {
+        self.offset_size as _
+    }
+
+    /// Returns whether this is a large list
+    pub fn is_large(&self) -> bool {
+        self.is_large
+    }
+
+    /// Returns the byte offset where the offset array starts
+    pub fn first_offset_byte(&self) -> usize {
+        self.first_offset_byte
+    }
+
+    /// Returns the byte offset where the values start
+    pub fn first_value_byte(&self) -> usize {
+        self.first_value_byte
+    }
+}
+
+// NOTE: We differ from the variant spec and call it "list" instead of "array" 
in order to be
+// consistent with parquet and arrow type naming. Otherwise, the name would 
conflict with the
+// `VariantArray : Array` we must eventually define for variant-typed arrow 
arrays.

Review Comment:
   ```suggestion
   /// Represents an Variant `Array`
   ///
   /// NOTE: The `List` naming differs from the variant spec, which uses 
"array" in order to be
   /// consistent with parquet and arrow type naming. Otherwise, the name would 
conflict with the
   /// `VariantArray : Array` we must eventually define for variant-typed arrow 
arrays.
   ```



##########
parquet-variant/src/variant.rs:
##########
@@ -300,62 +303,218 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
-    pub metadata: &'m VariantMetadata<'m>,
-    pub value: &'v [u8],
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObjectHeader {
+    field_offset_size: OffsetSizeBytes,
+    field_id_size: OffsetSizeBytes,
+    num_elements: usize,
+    field_ids_start_byte: usize,
+    field_offsets_start_byte: usize,
+    values_start_byte: usize,
 }
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
-        todo!();
-        #[allow(unreachable_code)] // Just to infer the return type
-        Ok(vec![].into_iter())
+
+impl VariantObjectHeader {
+    pub fn try_new(value: &[u8]) -> Result<Self, ArrowError> {
+        // Parse the header byte to get object parameters
+        let header = first_byte_from_slice(value)?;
+        let value_header = header >> 2;
+
+        let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
+        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
+        let is_large = value_header & 0x10; // 5th bit
+
+        let field_offset_size = 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?;
+        let field_id_size = OffsetSizeBytes::try_new(field_id_size_minus_one)?;
+
+        // Determine num_elements size based on is_large flag
+        let num_elements_size = if is_large != 0 {
+            OffsetSizeBytes::Four
+        } else {
+            OffsetSizeBytes::One
+        };
+
+        // Parse num_elements
+        let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
+
+        // Calculate byte offsets for different sections
+        let field_ids_start_byte = 1 + num_elements_size as usize;
+        let field_offsets_start_byte = field_ids_start_byte + num_elements * 
field_id_size as usize;
+        let values_start_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+
+        // Verify that the last field offset array entry is inside the value 
slice
+        let last_field_offset_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+        if last_field_offset_byte > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last field offset array entry at offset {} with length {} is 
outside the value slice of length {}",
+                last_field_offset_byte,
+                field_offset_size as usize,
+                value.len()
+            )));
+        }
+
+        // Verify that the value of the last field offset array entry fits 
inside the value slice
+        let last_field_offset =
+            field_offset_size.unpack_usize(value, field_offsets_start_byte, 
num_elements)?;
+        if values_start_byte + last_field_offset > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last field offset value {} at offset {} is outside the value 
slice of length {}",
+                last_field_offset,
+                values_start_byte,
+                value.len()
+            )));
+        }
+        Ok(Self {
+            field_offset_size,
+            field_id_size,
+            num_elements,
+            field_ids_start_byte,
+            field_offsets_start_byte,
+            values_start_byte,
+        })
     }
-    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
-        todo!()
+
+    /// Returns the number of key-value pairs in this object
+    pub fn num_elements(&self) -> usize {
+        self.num_elements
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantArray<'m, 'v> {
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
     pub metadata: &'m VariantMetadata<'m>,
     pub value: &'v [u8],
+    header: VariantObjectHeader,

Review Comment:
   I think this is a good design -- parse / validate the relevant fields from 
the header once and then save them to be used in subsequent passes



##########
parquet-variant/src/variant.rs:
##########
@@ -377,30 +534,134 @@ impl<'m, 'v> VariantArray<'m, 'v> {
             .checked_add(value_bytes)
             .ok_or_else(overflow)?;
 
-        // Skip num_elements bytes to read the offsets
-        let start_field_offset_from_first_value_byte =
-            offset_size.unpack_usize(self.value_data, first_offset_byte, 
index)?;
-        let end_field_offset_from_first_value_byte =
-            offset_size.unpack_usize(self.value_data, first_offset_byte, index 
+ 1)?;
+        // Verify that the last offset array entry is inside the value slice
+        let last_offset_byte = first_offset_byte + n_offsets * offset_size as 
usize;
+        if last_offset_byte > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last offset array entry at offset {} with length {} is 
outside the value slice of length {}",
+                last_offset_byte,
+                offset_size as usize,
+                value.len()
+            )));
+        }
+
+        // Verify that the value of the last offset array entry fits inside 
the value slice
+        let last_offset = offset_size.unpack_usize(value, first_offset_byte, 
num_elements)?;
+        if first_value_byte + last_offset > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last offset value {} at offset {} is outside the value slice 
of length {}",
+                last_offset,
+                first_value_byte,
+                value.len()
+            )));
+        }
+
+        Ok(Self {
+            offset_size,
+            is_large,
+            num_elements,
+            first_offset_byte,
+            first_value_byte,
+        })
+    }
+
+    /// Returns the number of elements in this list
+    pub fn num_elements(&self) -> usize {
+        self.num_elements
+    }
+
+    /// Returns the offset size in bytes
+    pub fn offset_size(&self) -> usize {
+        self.offset_size as _

Review Comment:
   if we ever need to optimize the size of the `VariantObjectHeader` we can 
potentially just store the header byte and re-extract the appropriate bits



##########
parquet-variant/src/variant.rs:
##########
@@ -300,62 +303,218 @@ impl<'m> VariantMetadata<'m> {
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
-    pub metadata: &'m VariantMetadata<'m>,
-    pub value: &'v [u8],
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObjectHeader {
+    field_offset_size: OffsetSizeBytes,
+    field_id_size: OffsetSizeBytes,
+    num_elements: usize,
+    field_ids_start_byte: usize,
+    field_offsets_start_byte: usize,
+    values_start_byte: usize,
 }
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
-        todo!();
-        #[allow(unreachable_code)] // Just to infer the return type
-        Ok(vec![].into_iter())
+
+impl VariantObjectHeader {
+    pub fn try_new(value: &[u8]) -> Result<Self, ArrowError> {
+        // Parse the header byte to get object parameters
+        let header = first_byte_from_slice(value)?;
+        let value_header = header >> 2;
+
+        let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
+        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
+        let is_large = value_header & 0x10; // 5th bit
+
+        let field_offset_size = 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?;
+        let field_id_size = OffsetSizeBytes::try_new(field_id_size_minus_one)?;
+
+        // Determine num_elements size based on is_large flag
+        let num_elements_size = if is_large != 0 {
+            OffsetSizeBytes::Four
+        } else {
+            OffsetSizeBytes::One
+        };
+
+        // Parse num_elements
+        let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
+
+        // Calculate byte offsets for different sections
+        let field_ids_start_byte = 1 + num_elements_size as usize;
+        let field_offsets_start_byte = field_ids_start_byte + num_elements * 
field_id_size as usize;
+        let values_start_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+
+        // Verify that the last field offset array entry is inside the value 
slice
+        let last_field_offset_byte =
+            field_offsets_start_byte + (num_elements + 1) * field_offset_size 
as usize;
+        if last_field_offset_byte > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last field offset array entry at offset {} with length {} is 
outside the value slice of length {}",
+                last_field_offset_byte,
+                field_offset_size as usize,
+                value.len()
+            )));
+        }
+
+        // Verify that the value of the last field offset array entry fits 
inside the value slice
+        let last_field_offset =
+            field_offset_size.unpack_usize(value, field_offsets_start_byte, 
num_elements)?;
+        if values_start_byte + last_field_offset > value.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last field offset value {} at offset {} is outside the value 
slice of length {}",
+                last_field_offset,
+                values_start_byte,
+                value.len()
+            )));
+        }
+        Ok(Self {
+            field_offset_size,
+            field_id_size,
+            num_elements,
+            field_ids_start_byte,
+            field_offsets_start_byte,
+            values_start_byte,
+        })
     }
-    pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> 
{
-        todo!()
+
+    /// Returns the number of key-value pairs in this object
+    pub fn num_elements(&self) -> usize {
+        self.num_elements
     }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq)]
-pub struct VariantArray<'m, 'v> {
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantObject<'m, 'v> {
     pub metadata: &'m VariantMetadata<'m>,
     pub value: &'v [u8],
+    header: VariantObjectHeader,
 }
 
-impl<'m, 'v> VariantArray<'m, 'v> {
-    /// Return the length of this array
+impl<'m, 'v> VariantObject<'m, 'v> {
+    pub fn try_new(metadata: &'m VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
+        Ok(Self {
+            metadata,
+            value,
+            header: VariantObjectHeader::try_new(value)?,
+        })
+    }
+
+    /// Returns the number of key-value pairs in this object
     pub fn len(&self) -> usize {
-        todo!()
+        self.header.num_elements()
     }
 
-    /// Is the array of zero length
+    /// Returns true if the object contains no key-value pairs
     pub fn is_empty(&self) -> bool {
         self.len() == 0
     }
 
-    pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, 
ArrowError> {
-        todo!();
-        #[allow(unreachable_code)] // Just to infer the return type
-        Ok(vec![].into_iter())
+    pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 
'v>)>, ArrowError> {
+        let field_list = self.parse_field_list()?;
+        Ok(field_list.into_iter())
     }
 
-    pub fn get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+    pub fn field(&self, name: &str) -> Result<Option<Variant<'m, 'v>>, 
ArrowError> {
+        // Binary search through the sorted field IDs to find the field
+        let (field_ids, field_offsets) = self.parse_field_arrays()?;
+        let search_result = try_binary_search_by(&field_ids, &name, 
|&field_id| {
+            self.metadata.get_field_by(field_id)
+        })?;
+
+        let Ok(index) = search_result else {
+            return Ok(None);
+        };
+        let start_offset = field_offsets[index];
+        let end_offset = field_offsets[index + 1];
+        let value_bytes = slice_from_slice(
+            self.value,
+            self.header.values_start_byte + start_offset
+                ..self.header.values_start_byte + end_offset,
+        )?;
+        let variant = Variant::try_new(self.metadata, value_bytes)?;
+        Ok(Some(variant))
+    }
+
+    /// Parse field IDs and field offsets arrays using the cached header
+    fn parse_field_arrays(&self) -> Result<(Vec<usize>, Vec<usize>), 
ArrowError> {
+        // Parse field IDs
+        let field_ids = (0..self.header.num_elements)
+            .map(|i| {
+                self.header.field_id_size.unpack_usize(
+                    self.value,
+                    self.header.field_ids_start_byte,
+                    i,
+                )
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+        debug_assert_eq!(field_ids.len(), self.header.num_elements);
+
+        // Parse field offsets (num_elements + 1 entries)
+        let field_offsets = (0..=self.header.num_elements)
+            .map(|i| {
+                self.header.field_offset_size.unpack_usize(
+                    self.value,
+                    self.header.field_offsets_start_byte,
+                    i,
+                )
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+        debug_assert_eq!(field_offsets.len(), self.header.num_elements + 1);
+
+        Ok((field_ids, field_offsets))
+    }
+
+    /// Parse all fields into a vector for iteration
+    fn parse_field_list(&self) -> Result<Vec<(&'m str, Variant<'m, 'v>)>, 
ArrowError> {
+        let (field_ids, field_offsets) = self.parse_field_arrays()?;
+
+        let mut fields = Vec::with_capacity(self.header.num_elements);
+
+        for i in 0..self.header.num_elements {
+            let field_id = field_ids[i];
+            let field_name = self.metadata.get_field_by(field_id)?;
+
+            let start_offset = field_offsets[i];
+            let end_offset = field_offsets[i + 1];
+            let value_bytes = slice_from_slice(
+                self.value,
+                self.header.values_start_byte + start_offset
+                    ..self.header.values_start_byte + end_offset,
+            )?;
+            let variant = Variant::try_new(self.metadata, value_bytes)?;
+
+            fields.push((field_name, variant));
+        }
+
+        Ok(fields)
+    }
+}
+
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantListHeader {

Review Comment:
   I wonder if this needs to be `pub` or if it is ok for it just to be 
`pub(crate)`



##########
parquet-variant/tests/variant_interop.rs:
##########
@@ -99,7 +99,7 @@ fn variant_non_primitive() -> Result<(), ArrowError> {
                 assert_eq!(dict_val, "int_field");
             }
             "array_primitive" => match variant {
-                Variant::Array(arr) => {
+                Variant::List(arr) => {

Review Comment:
   I think it is important to make sure this code works for the pre-existing 
test data from `parqet-testing` -- I will try and make a PR to update these 
tests and verify this PR's implementation



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to