This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 1fdb3184c2 [Variant] Improve getter API for `VariantList` and
`VariantObject` (#7757)
1fdb3184c2 is described below
commit 1fdb3184c206c28417271763c2d8626f498ffd8a
Author: Matthew Kim <[email protected]>
AuthorDate: Fri Jun 27 06:35:00 2025 -0400
[Variant] Improve getter API for `VariantList` and `VariantObject` (#7757)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/7756
# Rationale for this change
Updates `VariantList` and `VariantObject` getter methods to follow
similar conventions to `std::Vec` and `std::HashMap`:
| Existing method | Proposed method |
|----------------|----------------|
| `VariantList::get(index) -> Result<Variant>` |
`VariantList::get(index) -> Option<Variant>` |
| `VariantObject::field_by_name(name) -> Result<Option<Variant>>` |
`VariantObject::get(name) -> Option<Variant>` |
| `VariantObject::field(i) -> Result<Variant>` |
`VariantObject::field(i) -> Option<Variant>` |
| `VariantObject::field_name(i) -> Result<Variant>` |
`VariantObject::field_name(i) -> Option<Variant>` |
One thing to note, however, the existing methods all returned
`Result<T>` since these getters are not only exposed as public API, but
also used for validation inside the constructor `try_new`.
Since the latter usage requires an error message, I chose to rename the
original fallible methods with an `_err` suffix and use them internally.
The new public methods now act as wrappers over these `_err` variants.
# Are there any user-facing changes?
New API (and docs with tests)
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet-variant/src/builder.rs | 12 +++---
parquet-variant/src/utils.rs | 20 +++++-----
parquet-variant/src/variant.rs | 2 +-
parquet-variant/src/variant/list.rs | 25 ++++++++----
parquet-variant/src/variant/object.rs | 74 +++++++++++++++++++++++++++++------
5 files changed, 96 insertions(+), 37 deletions(-)
diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs
index d67ab9c001..74f8bf2a68 100644
--- a/parquet-variant/src/builder.rs
+++ b/parquet-variant/src/builder.rs
@@ -288,11 +288,11 @@ impl MetadataBuilder {
/// let variant = Variant::try_new(&metadata, &value).unwrap();
/// let variant_object = variant.as_object().unwrap();
/// assert_eq!(
-/// variant_object.field_by_name("first_name").unwrap(),
+/// variant_object.get("first_name"),
/// Some(Variant::from("Jiaying"))
/// );
/// assert_eq!(
-/// variant_object.field_by_name("last_name").unwrap(),
+/// variant_object.get("last_name"),
/// Some(Variant::from("Li"))
/// );
/// ```
@@ -367,11 +367,11 @@ impl MetadataBuilder {
/// let obj1_variant = variant_list.get(0).unwrap();
/// let obj1 = obj1_variant.as_object().unwrap();
/// assert_eq!(
-/// obj1.field_by_name("id").unwrap(),
+/// obj1.get("id"),
/// Some(Variant::from(1))
/// );
/// assert_eq!(
-/// obj1.field_by_name("type").unwrap(),
+/// obj1.get("type"),
/// Some(Variant::from("Cauliflower"))
/// );
///
@@ -379,11 +379,11 @@ impl MetadataBuilder {
/// let obj2 = obj2_variant.as_object().unwrap();
///
/// assert_eq!(
-/// obj2.field_by_name("id").unwrap(),
+/// obj2.get("id"),
/// Some(Variant::from(2))
/// );
/// assert_eq!(
-/// obj2.field_by_name("type").unwrap(),
+/// obj2.get("type"),
/// Some(Variant::from("Beets"))
/// );
///
diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs
index e0f966cab8..765ea04ae6 100644
--- a/parquet-variant/src/utils.rs
+++ b/parquet-variant/src/utils.rs
@@ -94,33 +94,33 @@ pub(crate) fn string_from_slice(
/// * `range` - The range to search in
/// * `target` - The target value to search for
/// * `key_extractor` - A function that extracts a comparable key from slice
elements.
-/// This function can fail and return an error.
+/// This function can fail and return None.
///
/// # Returns
-/// * `Ok(Ok(index))` - Element found at the given index
-/// * `Ok(Err(index))` - Element not found, but would be inserted at the given
index
-/// * `Err(e)` - Key extraction failed with error `e`
-pub(crate) fn try_binary_search_range_by<K, E, F>(
+/// * `Some(Ok(index))` - Element found at the given index
+/// * `Some(Err(index))` - Element not found, but would be inserted at the
given index
+/// * `None` - Key extraction failed
+pub(crate) fn try_binary_search_range_by<K, F>(
range: Range<usize>,
target: &K,
- mut key_extractor: F,
-) -> Result<Result<usize, usize>, E>
+ key_extractor: F,
+) -> Option<Result<usize, usize>>
where
K: Ord,
- F: FnMut(usize) -> Result<K, E>,
+ F: Fn(usize) -> Option<K>,
{
let Range { mut start, mut end } = range;
while start < end {
let mid = start + (end - start) / 2;
let key = key_extractor(mid)?;
match key.cmp(target) {
- std::cmp::Ordering::Equal => return Ok(Ok(mid)),
+ std::cmp::Ordering::Equal => return Some(Ok(mid)),
std::cmp::Ordering::Greater => end = mid,
std::cmp::Ordering::Less => start = mid + 1,
}
}
- Ok(Err(start))
+ Some(Err(start))
}
/// Attempts to prove a fallible iterator is actually infallible in practice,
by consuming every
diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs
index 4ca23aee0f..28583f1658 100644
--- a/parquet-variant/src/variant.rs
+++ b/parquet-variant/src/variant.rs
@@ -830,7 +830,7 @@ impl<'m, 'v> Variant<'m, 'v> {
/// let variant = Variant::try_new(&metadata, &value).unwrap();
/// // use the `as_object` method to access the object
/// let obj = variant.as_object().expect("variant should be an object");
- /// assert_eq!(obj.field_by_name("name").unwrap(),
Some(Variant::from("John")));
+ /// assert_eq!(obj.get("name"), Some(Variant::from("John")));
/// ```
pub fn as_object(&'m self) -> Option<&'m VariantObject<'m, 'v>> {
if let Variant::Object(obj) = self {
diff --git a/parquet-variant/src/variant/list.rs
b/parquet-variant/src/variant/list.rs
index 703761b420..42f97fa0d3 100644
--- a/parquet-variant/src/variant/list.rs
+++ b/parquet-variant/src/variant/list.rs
@@ -119,7 +119,20 @@ impl<'m, 'v> VariantList<'m, 'v> {
self.len() == 0
}
- pub fn get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+ /// Returns element by index in `0..self.len()`, if any
+ pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
+ if index >= self.num_elements {
+ return None;
+ }
+
+ match self.try_get(index) {
+ Ok(variant) => Some(variant),
+ Err(err) => panic!("validation error: {}", err),
+ }
+ }
+
+ /// Fallible version of `get`. Returns element by index, capturing
validation errors
+ fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
if index >= self.num_elements {
return Err(ArrowError::InvalidArgumentError(format!(
"Index {} out of bounds for list of length {}",
@@ -153,7 +166,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
// Fallible iteration over the fields of this dictionary. The constructor
traverses the iterator
// to prove it has no errors, so that all other use sites can blindly
`unwrap` the result.
fn iter_checked(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>,
ArrowError>> + '_ {
- (0..self.len()).map(move |i| self.get(i))
+ (0..self.len()).map(move |i| self.try_get(i))
}
}
@@ -208,11 +221,7 @@ mod tests {
// Test out of bounds access
let out_of_bounds = variant_list.get(3);
- assert!(out_of_bounds.is_err());
- assert!(matches!(
- out_of_bounds.unwrap_err(),
- ArrowError::InvalidArgumentError(ref msg) if msg.contains("out of
bounds")
- ));
+ assert!(out_of_bounds.is_none());
// Test values iterator
let values: Vec<_> = variant_list.iter().collect();
@@ -248,7 +257,7 @@ mod tests {
// Test out of bounds access on empty list
let out_of_bounds = variant_list.get(0);
- assert!(out_of_bounds.is_err());
+ assert!(out_of_bounds.is_none());
// Test values iterator on empty list
let values: Vec<_> = variant_list.iter().collect();
diff --git a/parquet-variant/src/variant/object.rs
b/parquet-variant/src/variant/object.rs
index b52701f8bb..9530f111f1 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -145,7 +145,19 @@ impl<'m, 'v> VariantObject<'m, 'v> {
}
/// Get a field's value by index in `0..self.len()`
- pub fn field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+ ///
+ /// # Panics
+ /// If the variant object is corrupted (e.g., invalid offsets or field
IDs).
+ /// This should never happen since the constructor validates all data
upfront.
+ pub fn field(&self, i: usize) -> Option<Variant<'m, 'v>> {
+ Some(
+ self.try_field(i)
+ .expect("validation error after construction"),
+ )
+ }
+
+ /// Fallible version of `field`. Returns field value by index, capturing
validation errors
+ fn try_field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
let start_offset = self.header.field_offset_size.unpack_usize(
self.value,
self.field_offsets_start_byte,
@@ -160,7 +172,19 @@ impl<'m, 'v> VariantObject<'m, 'v> {
}
/// Get a field's name by index in `0..self.len()`
- pub fn field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
+ ///
+ /// # Panics
+ /// If the variant object is corrupted (e.g., invalid offsets or field
IDs).
+ /// This should never happen since the constructor validates all data
upfront.
+ pub fn field_name(&self, i: usize) -> Option<&'m str> {
+ Some(
+ self.try_field_name(i)
+ .expect("validation error after construction"),
+ )
+ }
+
+ /// Fallible version of `field_name`. Returns field name by index,
capturing validation errors
+ fn try_field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
let field_id =
self.header
.field_id_size
@@ -179,22 +203,22 @@ impl<'m, 'v> VariantObject<'m, 'v> {
fn iter_checked(
&self,
) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>>
+ '_ {
- (0..self.num_elements).map(move |i| Ok((self.field_name(i)?,
self.field(i)?)))
+ (0..self.num_elements).map(move |i| Ok((self.try_field_name(i)?,
self.try_field(i)?)))
}
/// Returns the value of the field with the specified name, if any.
///
/// `Ok(None)` means the field does not exist; `Err` means the search
encountered an error.
- pub fn field_by_name(&self, name: &str) -> Result<Option<Variant<'m, 'v>>,
ArrowError> {
+ pub fn get(&self, name: &str) -> Option<Variant<'m, 'v>> {
// Binary search through the field IDs of this object to find the
requested field name.
//
// NOTE: This does not require a sorted metadata dictionary, because
the variant spec
// requires object field ids to be lexically sorted by their
corresponding string values,
// and probing the dictionary for a field id is always O(1) work.
- let search_result =
- try_binary_search_range_by(0..self.num_elements, &name, |i|
self.field_name(i))?;
+ let i = try_binary_search_range_by(0..self.num_elements, &name, |i|
self.field_name(i))?
+ .ok()?;
- search_result.ok().map(|i| self.field(i)).transpose()
+ self.field(i)
}
}
@@ -260,22 +284,37 @@ mod tests {
assert!(!variant_obj.is_empty());
// Test field access
- let active_field = variant_obj.field_by_name("active").unwrap();
+ let active_field = variant_obj.get("active");
assert!(active_field.is_some());
assert_eq!(active_field.unwrap().as_boolean(), Some(true));
- let age_field = variant_obj.field_by_name("age").unwrap();
+ let age_field = variant_obj.get("age");
assert!(age_field.is_some());
assert_eq!(age_field.unwrap().as_int8(), Some(42));
- let name_field = variant_obj.field_by_name("name").unwrap();
+ let name_field = variant_obj.get("name");
assert!(name_field.is_some());
assert_eq!(name_field.unwrap().as_string(), Some("hello"));
// Test non-existent field
- let missing_field = variant_obj.field_by_name("missing").unwrap();
+ let missing_field = variant_obj.get("missing");
assert!(missing_field.is_none());
+ // https://github.com/apache/arrow-rs/issues/7784
+ // Fixme: The following assertion will panic! That is not good
+ // let missing_field_name = variant_obj.field_name(3);
+ // assert!(missing_field_name.is_none());
+ //
+ // Fixme: The `.field_name()` will panic! This is not good
+ // let missing_field_name = variant_obj.field_name(300);
+ // assert!(missing_field_name.is_none());
+
+ // let missing_field_value = variant_obj.field(3);
+ // assert!(missing_field_value.is_none());
+
+ // let missing_field_value = variant_obj.field(300);
+ // assert!(missing_field_value.is_none());
+
// Test fields iterator
let fields: Vec<_> = variant_obj.iter().collect();
assert_eq!(fields.len(), 3);
@@ -289,6 +328,17 @@ mod tests {
assert_eq!(fields[2].0, "name");
assert_eq!(fields[2].1.as_string(), Some("hello"));
+
+ // Test field access by index
+ // Fields should be in sorted order: active, age, name
+ assert_eq!(variant_obj.field_name(0), Some("active"));
+ assert_eq!(variant_obj.field(0).unwrap().as_boolean(), Some(true));
+
+ assert_eq!(variant_obj.field_name(1), Some("age"));
+ assert_eq!(variant_obj.field(1).unwrap().as_int8(), Some(42));
+
+ assert_eq!(variant_obj.field_name(2), Some("name"));
+ assert_eq!(variant_obj.field(2).unwrap().as_string(), Some("hello"));
}
#[test]
@@ -316,7 +366,7 @@ mod tests {
assert!(variant_obj.is_empty());
// Test field access on empty object
- let missing_field = variant_obj.field_by_name("anything").unwrap();
+ let missing_field = variant_obj.get("anything");
assert!(missing_field.is_none());
// Test fields iterator on empty object