This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 2405798912 [Variant] Add flag in `ObjectBuilder` to control validation 
behavior on duplicate field write (#7801)
2405798912 is described below

commit 24057989124de2ef0867035156c6dc8dabed6a49
Author: Michael Renda <[email protected]>
AuthorDate: Tue Jul 1 11:18:33 2025 -0400

    [Variant] Add flag in `ObjectBuilder` to control validation behavior on 
duplicate field write (#7801)
    
    # Which issue does this PR close?
    
    Closes #7777.
    
    # What changes are included in this PR?
    
    Added method `with_validate_unique_fields` that modifies an
    `ObjectBuilder` to throw an error in `finish` if duplicate fields were
    inserted. The error message thrown displays all of the duplicate keys
    that were inserted to the object.
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 parquet-variant/benches/variant_builder.rs |  22 ++--
 parquet-variant/src/builder.rs             | 186 +++++++++++++++++++++++++----
 parquet-variant/src/to_json.rs             |  10 +-
 parquet-variant/tests/variant_interop.rs   |   2 +-
 4 files changed, 180 insertions(+), 40 deletions(-)

diff --git a/parquet-variant/benches/variant_builder.rs 
b/parquet-variant/benches/variant_builder.rs
index 432c4192e3..f69e3170c6 100644
--- a/parquet-variant/benches/variant_builder.rs
+++ b/parquet-variant/benches/variant_builder.rs
@@ -77,7 +77,7 @@ fn bench_object_field_names_reverse_order(c: &mut Criterion) {
                 object_builder.insert(format!("{}", 1000 - i).as_str(), 
string_table.next());
             }
 
-            object_builder.finish();
+            object_builder.finish().unwrap();
             hint::black_box(variant.finish());
         })
     });
@@ -113,7 +113,7 @@ fn bench_object_same_schema(c: &mut Criterion) {
                 inner_list_builder.append_value(string_table.next());
 
                 inner_list_builder.finish();
-                object_builder.finish();
+                object_builder.finish().unwrap();
 
                 hint::black_box(variant.finish());
             }
@@ -154,7 +154,7 @@ fn bench_object_list_same_schema(c: &mut Criterion) {
                 list_builder.append_value(string_table.next());
 
                 list_builder.finish();
-                object_builder.finish();
+                object_builder.finish().unwrap();
             }
 
             list_builder.finish();
@@ -189,7 +189,7 @@ fn bench_object_unknown_schema(c: &mut Criterion) {
                             let key = string_table.next();
                             inner_object_builder.insert(key, key);
                         }
-                        inner_object_builder.finish();
+                        inner_object_builder.finish().unwrap();
 
                         continue;
                     }
@@ -202,7 +202,7 @@ fn bench_object_unknown_schema(c: &mut Criterion) {
 
                     inner_list_builder.finish();
                 }
-                object_builder.finish();
+                object_builder.finish().unwrap();
                 hint::black_box(variant.finish());
             }
         })
@@ -241,7 +241,7 @@ fn bench_object_list_unknown_schema(c: &mut Criterion) {
                             let key = string_table.next();
                             inner_object_builder.insert(key, key);
                         }
-                        inner_object_builder.finish();
+                        inner_object_builder.finish().unwrap();
 
                         continue;
                     }
@@ -254,7 +254,7 @@ fn bench_object_list_unknown_schema(c: &mut Criterion) {
 
                     inner_list_builder.finish();
                 }
-                object_builder.finish();
+                object_builder.finish().unwrap();
             }
 
             list_builder.finish();
@@ -314,10 +314,10 @@ fn bench_object_partially_same_schema(c: &mut Criterion) {
                         let key = string_table.next();
                         inner_object_builder.insert(key, key);
                     }
-                    inner_object_builder.finish();
+                    inner_object_builder.finish().unwrap();
                 }
 
-                object_builder.finish();
+                object_builder.finish().unwrap();
                 hint::black_box(variant.finish());
             }
         })
@@ -376,10 +376,10 @@ fn bench_object_list_partially_same_schema(c: &mut 
Criterion) {
                         let key = string_table.next();
                         inner_object_builder.insert(key, key);
                     }
-                    inner_object_builder.finish();
+                    inner_object_builder.finish().unwrap();
                 }
 
-                object_builder.finish();
+                object_builder.finish().unwrap();
             }
 
             list_builder.finish();
diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs
index f0f3237147..3a8f7af6a0 100644
--- a/parquet-variant/src/builder.rs
+++ b/parquet-variant/src/builder.rs
@@ -16,7 +16,9 @@
 // under the License.
 use crate::decoder::{VariantBasicType, VariantPrimitiveType};
 use crate::{ShortString, Variant, VariantDecimal16, VariantDecimal4, 
VariantDecimal8};
+use arrow_schema::ArrowError;
 use indexmap::{IndexMap, IndexSet};
+use std::collections::HashSet;
 
 const BASIC_TYPE_BITS: u8 = 2;
 const UNIX_EPOCH_DATE: chrono::NaiveDate = 
chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
@@ -435,10 +437,27 @@ impl MetadataBuilder {
 /// );
 ///
 /// ```
+/// # Example: Unique Field Validation
+///
+/// This example shows how enabling unique field validation will cause an error
+/// if the same field is inserted more than once.
+/// ```
+/// use parquet_variant::VariantBuilder;
+///
+/// let mut builder = VariantBuilder::new().with_validate_unique_fields(true);
+/// let mut obj = builder.new_object();
+///
+/// obj.insert("a", 1);
+/// obj.insert("a", 2); // duplicate field
+///
+/// let result = obj.finish(); // returns Err
+/// assert!(result.is_err());
+/// ```
 #[derive(Default)]
 pub struct VariantBuilder {
     buffer: ValueBuffer,
     metadata_builder: MetadataBuilder,
+    validate_unique_fields: bool,
 }
 
 impl VariantBuilder {
@@ -446,14 +465,26 @@ impl VariantBuilder {
         Self {
             buffer: ValueBuffer::default(),
             metadata_builder: MetadataBuilder::default(),
+            validate_unique_fields: false,
         }
     }
 
+    /// Enables validation of unique field keys in nested objects.
+    ///
+    /// This setting is propagated to all [`ObjectBuilder`]s created through 
this [`VariantBuilder`]
+    /// (including via any [`ListBuilder`]), and causes 
[`ObjectBuilder::finish()`] to return
+    /// an error if duplicate keys were inserted.
+    pub fn with_validate_unique_fields(mut self, validate_unique_fields: bool) 
-> Self {
+        self.validate_unique_fields = validate_unique_fields;
+        self
+    }
+
     /// Create an [`ListBuilder`] for creating [`Variant::List`] values.
     ///
     /// See the examples on [`VariantBuilder`] for usage.
     pub fn new_list(&mut self) -> ListBuilder {
         ListBuilder::new(&mut self.buffer, &mut self.metadata_builder)
+            .with_validate_unique_fields(self.validate_unique_fields)
     }
 
     /// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values.
@@ -461,6 +492,7 @@ impl VariantBuilder {
     /// See the examples on [`VariantBuilder`] for usage.
     pub fn new_object(&mut self) -> ObjectBuilder {
         ObjectBuilder::new(&mut self.buffer, &mut self.metadata_builder)
+            .with_validate_unique_fields(self.validate_unique_fields)
     }
 
     pub fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) 
{
@@ -482,6 +514,7 @@ pub struct ListBuilder<'a> {
     buffer: ValueBuffer,
     /// Is there a pending nested object or list that needs to be finalized?
     pending: bool,
+    validate_unique_fields: bool,
 }
 
 impl<'a> ListBuilder<'a> {
@@ -492,6 +525,7 @@ impl<'a> ListBuilder<'a> {
             offsets: vec![0],
             buffer: ValueBuffer::default(),
             pending: false,
+            validate_unique_fields: false,
         }
     }
 
@@ -506,10 +540,20 @@ impl<'a> ListBuilder<'a> {
         self.pending = false;
     }
 
+    /// Enables unique field key validation for objects created within this 
list.
+    ///
+    /// Propagates the validation flag to any [`ObjectBuilder`]s created using
+    /// [`ListBuilder::new_object`].
+    pub fn with_validate_unique_fields(mut self, validate_unique_fields: bool) 
-> Self {
+        self.validate_unique_fields = validate_unique_fields;
+        self
+    }
+
     pub fn new_object(&mut self) -> ObjectBuilder {
         self.check_new_offset();
 
-        let obj_builder = ObjectBuilder::new(&mut self.buffer, 
self.metadata_builder);
+        let obj_builder = ObjectBuilder::new(&mut self.buffer, 
self.metadata_builder)
+            .with_validate_unique_fields(self.validate_unique_fields);
         self.pending = true;
 
         obj_builder
@@ -518,7 +562,8 @@ impl<'a> ListBuilder<'a> {
     pub fn new_list(&mut self) -> ListBuilder {
         self.check_new_offset();
 
-        let list_builder = ListBuilder::new(&mut self.buffer, 
self.metadata_builder);
+        let list_builder = ListBuilder::new(&mut self.buffer, 
self.metadata_builder)
+            .with_validate_unique_fields(self.validate_unique_fields);
         self.pending = true;
 
         list_builder
@@ -568,6 +613,9 @@ pub struct ObjectBuilder<'a, 'b> {
     buffer: ValueBuffer,
     /// Is there a pending list or object that needs to be finalized?
     pending: Option<(&'b str, usize)>,
+    validate_unique_fields: bool,
+    /// Set of duplicate fields to report for errors
+    duplicate_fields: HashSet<u32>,
 }
 
 impl<'a, 'b> ObjectBuilder<'a, 'b> {
@@ -578,6 +626,8 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
             fields: IndexMap::new(),
             buffer: ValueBuffer::default(),
             pending: None,
+            validate_unique_fields: false,
+            duplicate_fields: HashSet::new(),
         }
     }
 
@@ -602,17 +652,30 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
         let field_id = self.metadata_builder.upsert_field_name(key);
         let field_start = self.buffer.offset();
 
-        self.fields.insert(field_id, field_start);
+        if self.fields.insert(field_id, field_start).is_some() && 
self.validate_unique_fields {
+            self.duplicate_fields.insert(field_id);
+        }
+
         self.buffer.append_non_nested_value(value);
     }
 
+    /// Enables validation for unique field keys when inserting into this 
object.
+    ///
+    /// When this is enabled, calling [`ObjectBuilder::finish`] will return an 
error
+    /// if any duplicate field keys were added using [`ObjectBuilder::insert`].
+    pub fn with_validate_unique_fields(mut self, validate_unique_fields: bool) 
-> Self {
+        self.validate_unique_fields = validate_unique_fields;
+        self
+    }
+
     /// Return a new [`ObjectBuilder`] to add a nested object with the 
specified
     /// key to the object.
     pub fn new_object(&mut self, key: &'b str) -> ObjectBuilder {
         self.check_pending_field();
 
         let field_start = self.buffer.offset();
-        let obj_builder = ObjectBuilder::new(&mut self.buffer, 
self.metadata_builder);
+        let obj_builder = ObjectBuilder::new(&mut self.buffer, 
self.metadata_builder)
+            .with_validate_unique_fields(self.validate_unique_fields);
         self.pending = Some((key, field_start));
 
         obj_builder
@@ -624,7 +687,8 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
         self.check_pending_field();
 
         let field_start = self.buffer.offset();
-        let list_builder = ListBuilder::new(&mut self.buffer, 
self.metadata_builder);
+        let list_builder = ListBuilder::new(&mut self.buffer, 
self.metadata_builder)
+            .with_validate_unique_fields(self.validate_unique_fields);
         self.pending = Some((key, field_start));
 
         list_builder
@@ -633,9 +697,24 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
     /// Finalize object
     ///
     /// This consumes self and writes the object to the parent buffer.
-    pub fn finish(mut self) {
+    pub fn finish(mut self) -> Result<(), ArrowError> {
         self.check_pending_field();
 
+        if self.validate_unique_fields && !self.duplicate_fields.is_empty() {
+            let mut names = self
+                .duplicate_fields
+                .iter()
+                .map(|id| self.metadata_builder.field_name(*id as usize))
+                .collect::<Vec<_>>();
+
+            names.sort_unstable();
+
+            let joined = names.join(", ");
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Duplicate field keys detected: [{joined}]",
+            )));
+        }
+
         let data_size = self.buffer.offset();
         let num_fields = self.fields.len();
         let is_large = num_fields > u8::MAX as usize;
@@ -672,6 +751,8 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
         write_offset(self.parent_buffer.inner_mut(), data_size, offset_size);
 
         self.parent_buffer.append_slice(self.buffer.inner());
+
+        Ok(())
     }
 }
 
@@ -821,7 +902,7 @@ mod tests {
             let mut obj = builder.new_object();
             obj.insert("name", "John");
             obj.insert("age", 42i8);
-            obj.finish();
+            let _ = obj.finish();
         }
 
         let (metadata, value) = builder.finish();
@@ -838,7 +919,7 @@ mod tests {
             obj.insert("zebra", "stripes"); // ID = 0
             obj.insert("apple", "red"); // ID = 1
             obj.insert("banana", "yellow"); // ID = 2
-            obj.finish();
+            let _ = obj.finish();
         }
 
         let (_, value) = builder.finish();
@@ -862,7 +943,7 @@ mod tests {
         let mut object_builder = builder.new_object();
         object_builder.insert("name", "Ron Artest");
         object_builder.insert("name", "Metta World Peace");
-        object_builder.finish();
+        let _ = object_builder.finish();
 
         let (metadata, value) = builder.finish();
         let variant = Variant::try_new(&metadata, &value).unwrap();
@@ -983,14 +1064,14 @@ mod tests {
             let mut object_builder = list_builder.new_object();
             object_builder.insert("id", 1);
             object_builder.insert("type", "Cauliflower");
-            object_builder.finish();
+            let _ = object_builder.finish();
         }
 
         {
             let mut object_builder = list_builder.new_object();
             object_builder.insert("id", 2);
             object_builder.insert("type", "Beets");
-            object_builder.finish();
+            let _ = object_builder.finish();
         }
 
         list_builder.finish();
@@ -1031,13 +1112,13 @@ mod tests {
         {
             let mut object_builder = list_builder.new_object();
             object_builder.insert("a", 1);
-            object_builder.finish();
+            let _ = object_builder.finish();
         }
 
         {
             let mut object_builder = list_builder.new_object();
             object_builder.insert("b", 2);
-            object_builder.finish();
+            let _ = object_builder.finish();
         }
 
         list_builder.finish();
@@ -1084,7 +1165,7 @@ mod tests {
         {
             let mut object_builder = list_builder.new_object();
             object_builder.insert("a", 1);
-            object_builder.finish();
+            let _ = object_builder.finish();
         }
 
         list_builder.append_value(2);
@@ -1092,7 +1173,7 @@ mod tests {
         {
             let mut object_builder = list_builder.new_object();
             object_builder.insert("b", 2);
-            object_builder.finish();
+            let _ = object_builder.finish();
         }
 
         list_builder.append_value(3);
@@ -1142,10 +1223,10 @@ mod tests {
             {
                 let mut inner_object_builder = 
outer_object_builder.new_object("c");
                 inner_object_builder.insert("b", "a");
-                inner_object_builder.finish();
+                let _ = inner_object_builder.finish();
             }
 
-            outer_object_builder.finish();
+            let _ = outer_object_builder.finish();
         }
 
         let (metadata, value) = builder.finish();
@@ -1184,11 +1265,11 @@ mod tests {
                 inner_object_builder.insert("b", false);
                 inner_object_builder.insert("c", "a");
 
-                inner_object_builder.finish();
+                let _ = inner_object_builder.finish();
             }
 
             outer_object_builder.insert("b", false);
-            outer_object_builder.finish();
+            let _ = outer_object_builder.finish();
         }
 
         let (metadata, value) = builder.finish();
@@ -1232,10 +1313,10 @@ mod tests {
                     inner_object_list_builder.finish();
                 }
 
-                inner_object_builder.finish();
+                let _ = inner_object_builder.finish();
             }
 
-            outer_object_builder.finish();
+            let _ = outer_object_builder.finish();
         }
 
         let (metadata, value) = builder.finish();
@@ -1280,12 +1361,12 @@ mod tests {
             {
                 let mut inner_object_builder = 
outer_object_builder.new_object("c");
                 inner_object_builder.insert("b", "a");
-                inner_object_builder.finish();
+                let _ = inner_object_builder.finish();
             }
 
             outer_object_builder.insert("b", true);
 
-            outer_object_builder.finish();
+            let _ = outer_object_builder.finish();
         }
 
         let (metadata, value) = builder.finish();
@@ -1321,4 +1402,63 @@ mod tests {
         assert_eq!(outer_object.field_name(1).unwrap(), "b");
         assert_eq!(outer_object.field(1).unwrap(), Variant::from(true));
     }
+
+    #[test]
+    fn test_object_without_unique_field_validation() {
+        let mut builder = VariantBuilder::new();
+
+        // Root object with duplicates
+        let mut obj = builder.new_object();
+        obj.insert("a", 1);
+        obj.insert("a", 2);
+        assert!(obj.finish().is_ok());
+
+        // Deeply nested list structure with duplicates
+        let mut outer_list = builder.new_list();
+        let mut inner_list = outer_list.new_list();
+        let mut nested_obj = inner_list.new_object();
+        nested_obj.insert("x", 1);
+        nested_obj.insert("x", 2);
+        assert!(nested_obj.finish().is_ok());
+    }
+
+    #[test]
+    fn test_object_with_unique_field_validation() {
+        let mut builder = 
VariantBuilder::new().with_validate_unique_fields(true);
+
+        // Root-level object with duplicates
+        let mut root_obj = builder.new_object();
+        root_obj.insert("a", 1);
+        root_obj.insert("b", 2);
+        root_obj.insert("a", 3);
+        root_obj.insert("b", 4);
+
+        let result = root_obj.finish();
+        assert_eq!(
+            result.unwrap_err().to_string(),
+            "Invalid argument error: Duplicate field keys detected: [a, b]"
+        );
+
+        // Deeply nested list -> list -> object with duplicate
+        let mut outer_list = builder.new_list();
+        let mut inner_list = outer_list.new_list();
+        let mut nested_obj = inner_list.new_object();
+        nested_obj.insert("x", 1);
+        nested_obj.insert("x", 2);
+
+        let nested_result = nested_obj.finish();
+        assert_eq!(
+            nested_result.unwrap_err().to_string(),
+            "Invalid argument error: Duplicate field keys detected: [x]"
+        );
+
+        // Valid object should succeed
+        let mut list = builder.new_list();
+        let mut valid_obj = list.new_object();
+        valid_obj.insert("m", 1);
+        valid_obj.insert("n", 2);
+
+        let valid_result = valid_obj.finish();
+        assert!(valid_result.is_ok());
+    }
 }
diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs
index 07ce7b83d1..b27fca6108 100644
--- a/parquet-variant/src/to_json.rs
+++ b/parquet-variant/src/to_json.rs
@@ -859,7 +859,7 @@ mod tests {
             obj.insert("age", 30i32);
             obj.insert("active", true);
             obj.insert("score", 95.5f64);
-            obj.finish();
+            obj.finish().unwrap();
         }
 
         let (metadata, value) = builder.finish();
@@ -890,7 +890,7 @@ mod tests {
 
         {
             let obj = builder.new_object();
-            obj.finish();
+            obj.finish().unwrap();
         }
 
         let (metadata, value) = builder.finish();
@@ -915,7 +915,7 @@ mod tests {
             obj.insert("message", "Hello \"World\"\nWith\tTabs");
             obj.insert("path", "C:\\Users\\Alice\\Documents");
             obj.insert("unicode", "😀 Smiley");
-            obj.finish();
+            obj.finish().unwrap();
         }
 
         let (metadata, value) = builder.finish();
@@ -1030,7 +1030,7 @@ mod tests {
             obj.insert("zebra", "last");
             obj.insert("alpha", "first");
             obj.insert("beta", "second");
-            obj.finish();
+            obj.finish().unwrap();
         }
 
         let (metadata, value) = builder.finish();
@@ -1098,7 +1098,7 @@ mod tests {
             obj.insert("float_field", 2.71f64);
             obj.insert("null_field", ());
             obj.insert("long_field", 999i64);
-            obj.finish();
+            obj.finish().unwrap();
         }
 
         let (metadata, value) = builder.finish();
diff --git a/parquet-variant/tests/variant_interop.rs 
b/parquet-variant/tests/variant_interop.rs
index 20ad7899f2..dcf1200d33 100644
--- a/parquet-variant/tests/variant_interop.rs
+++ b/parquet-variant/tests/variant_interop.rs
@@ -264,7 +264,7 @@ fn variant_object_builder() {
     obj.insert("null_field", ());
     obj.insert("timestamp_field", "2025-04-16T12:34:56.78");
 
-    obj.finish();
+    obj.finish().unwrap();
 
     let (built_metadata, built_value) = builder.finish();
     let actual = Variant::try_new(&built_metadata, &built_value).unwrap();

Reply via email to