scovich commented on code in PR #8167:
URL: https://github.com/apache/arrow-rs/pull/8167#discussion_r2283264736


##########
parquet-variant-compute/src/variant_array_builder.rs:
##########
@@ -73,13 +76,13 @@ pub struct VariantArrayBuilder {
     /// Nulls
     nulls: NullBufferBuilder,
     /// buffer for all the metadata
-    metadata_buffer: Vec<u8>,
+    metadata_builder: MetadataBuilderXX,

Review Comment:
   Because the metadata builder is now "reusable" (each new `finish` call just 
appends another chunk of bytes to the underlying byte buffer), we can 
instantiate it directly and just pass it by reference to the 
`VariantArrayVariantBuilder` helper class.



##########
parquet-variant-compute/src/variant_array_builder.rs:
##########
@@ -73,13 +76,13 @@ pub struct VariantArrayBuilder {
     /// Nulls
     nulls: NullBufferBuilder,
     /// buffer for all the metadata
-    metadata_buffer: Vec<u8>,
+    metadata_builder: MetadataBuilderXX,
     /// (offset, len) pairs for locations of metadata in the buffer
-    metadata_locations: Vec<(usize, usize)>,
+    metadata_offsets: Vec<usize>,

Review Comment:
   Track starting offsets (similar to normal variant code), and infer the 
length of each entry as the difference between adjacent offsets. Ditto for 
`value_offsets` below.



##########
parquet-variant-compute/src/variant_array_builder.rs:
##########
@@ -73,13 +76,13 @@ pub struct VariantArrayBuilder {
     /// Nulls
     nulls: NullBufferBuilder,
     /// buffer for all the metadata
-    metadata_buffer: Vec<u8>,
+    metadata_builder: MetadataBuilderXX,
     /// (offset, len) pairs for locations of metadata in the buffer
-    metadata_locations: Vec<(usize, usize)>,
+    metadata_offsets: Vec<usize>,
     /// buffer for values
-    value_buffer: Vec<u8>,
+    value_builder: ValueBuilder,

Review Comment:
   This used to be called `ValueBuffer`, and it turns out to have a better API 
for our purposes than `VariantBuilder`. So we no longer create a 
`VariantBuilder` at all, and just pass the value and metadata builders around 
by reference as needed. 



##########
parquet-variant-compute/src/variant_array_builder.rs:
##########
@@ -236,30 +229,22 @@ impl<'a> VariantArrayVariantBuilder<'a> {
     ///
     /// Note this is not public as this is a structure that is logically
     /// part of the [`VariantArrayBuilder`] and relies on its internal 
structure
-    fn new(
-        array_builder: &'a mut VariantArrayBuilder,
-        metadata_buffer: Vec<u8>,
-        value_buffer: Vec<u8>,
-    ) -> Self {
-        let metadata_offset = metadata_buffer.len();
-        let value_offset = value_buffer.len();
+    fn new(array_builder: &'a mut VariantArrayBuilder) -> Self {
+        let metadata_offset = array_builder.metadata_builder.offset();
+        let value_offset = array_builder.value_builder.offset();
         VariantArrayVariantBuilder {
             finished: false,
             metadata_offset,
             value_offset,
-            variant_builder: VariantBuilder::new_with_buffers(metadata_buffer, 
value_buffer),
             array_builder,
         }
     }
 
-    /// Return a reference to the underlying `VariantBuilder`
-    pub fn inner(&self) -> &VariantBuilder {
-        &self.variant_builder
-    }
-
-    /// Return a mutable reference to the underlying `VariantBuilder`
-    pub fn inner_mut(&mut self) -> &mut VariantBuilder {
-        &mut self.variant_builder
+    fn parent_state(&mut self) -> ParentState<'_> {
+        ParentState::variant(
+            &mut self.array_builder.value_builder,
+            &mut self.array_builder.metadata_builder,

Review Comment:
   This is one of the important bits of magic -- we leverage the new parent 
state flavored `ValueBuilder` API, which allows us to construct an appropriate 
`ParentState` directly from our internal builders. No need to pass ownership 
back and forth between the `VariantArrayBuilder` and short-lived 
`VariantBuilder` instances.



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -331,6 +331,11 @@ impl<'m> VariantMetadata<'m> {
         self.iter_try()
             .map(|result| result.expect("Invalid metadata dictionary entry"))
     }
+
+    /// Same as `Index::index`, but with the correct lifetime.
+    pub(crate) fn get_infallible(&self, i: usize) -> &'m str {

Review Comment:
   `Index::index` is defined to return values with the lifetime of `&self`, 
which is shorter than the string's true lifetime `'m`. Which makes it 
impossible to use the result of `metadata[i]` in a hash map storing `&'m str`.



##########
parquet-variant/src/builder.rs:
##########
@@ -86,28 +87,16 @@ fn append_packed_u32(dest: &mut Vec<u8>, value: u32, 
value_size: usize) {
 ///
 /// You can reuse an existing `Vec<u8>` by using the `from` impl
 #[derive(Debug, Default)]
-struct ValueBuffer(Vec<u8>);
+pub struct ValueBuilder(Vec<u8>);
 
-impl ValueBuffer {
+impl ValueBuilder {
     /// Construct a ValueBuffer that will write to a new underlying `Vec`
-    fn new() -> Self {
+    pub fn new() -> Self {
         Default::default()
     }
 }
 
-impl From<Vec<u8>> for ValueBuffer {
-    fn from(value: Vec<u8>) -> Self {
-        Self(value)
-    }
-}
-
-impl From<ValueBuffer> for Vec<u8> {
-    fn from(value_buffer: ValueBuffer) -> Self {
-        value_buffer.0
-    }
-}

Review Comment:
   Supporting code for the old `VariantArrayBuilder` approach. 
   
   Open question whether we think it's helpful to keep even tho it's not 
strictly needed any more?



##########
parquet-variant-compute/src/variant_array_builder.rs:
##########
@@ -272,84 +257,46 @@ impl<'a> VariantArrayVariantBuilder<'a> {
 
         let metadata_offset = self.metadata_offset;
         let value_offset = self.value_offset;
-        // get the buffers back from the variant builder
-        let (metadata_buffer, value_buffer) = std::mem::take(&mut 
self.variant_builder).finish();
+
+        let metadata_builder = &mut self.array_builder.metadata_builder;
+        let value_builder = &mut self.array_builder.value_builder;
 
         // Sanity Check: if the buffers got smaller, something went wrong 
(previous data was lost)
-        let metadata_len = metadata_buffer
-            .len()
-            .checked_sub(metadata_offset)
-            .expect("metadata length decreased unexpectedly");
-        let value_len = value_buffer
-            .len()
-            .checked_sub(value_offset)
-            .expect("value length decreased unexpectedly");
+        assert!(
+            metadata_offset <= metadata_builder.offset(),
+            "metadata length decreased unexpectedly"
+        );
+        assert!(
+            value_offset <= value_builder.offset(),
+            "value length decreased unexpectedly"
+        );
+
+        metadata_builder.finish();
 
         // commit the changes by putting the
         // offsets and lengths into the parent array builder.
-        self.array_builder
-            .metadata_locations
-            .push((metadata_offset, metadata_len));
-        self.array_builder
-            .value_locations
-            .push((value_offset, value_len));
+        self.array_builder.metadata_offsets.push(metadata_offset);
+        self.array_builder.value_offsets.push(value_offset);

Review Comment:
   The `ParentState` API doesn't cover these fields, so we take care of them 
here.
   
   The code would be a bit simpler if we stored end offsets instead of starting 
offsets, but I couldn't get it to work for some reason (failing unit tests with 
wrong offsets). See the reverted commit for details.



##########
parquet-variant/src/builder.rs:
##########
@@ -285,97 +264,78 @@ impl ValueBuffer {
         Ok(())
     }
 
-    fn offset(&self) -> usize {
+    pub fn offset(&self) -> usize {
         self.0.len()
     }
 
-    fn new_object<'a>(
-        &'a mut self,
-        metadata_builder: &'a mut MetadataBuilder,
-    ) -> ObjectBuilder<'a> {
-        let parent_state = ParentState::Variant {
-            buffer: self,
-            metadata_builder,
-        };
-        let validate_unique_fields = false;
-        ObjectBuilder::new(parent_state, validate_unique_fields)
-    }
-
-    fn new_list<'a>(&'a mut self, metadata_builder: &'a mut MetadataBuilder) 
-> ListBuilder<'a> {
-        let parent_state = ParentState::Variant {
-            buffer: self,
-            metadata_builder,
-        };
-        let validate_unique_fields = false;
-        ListBuilder::new(parent_state, validate_unique_fields)
-    }
-
-    /// Appends a variant to the buffer.
+    /// Appends a variant to the provided [`ParentState`] instance.
     ///
     /// # Panics
     ///
     /// This method will panic if the variant contains duplicate field names 
in objects
-    /// when validation is enabled. For a fallible version, use 
[`ValueBuffer::try_append_variant`]
-    fn append_variant<'m, 'd>(
-        &mut self,
-        variant: Variant<'m, 'd>,
-        metadata_builder: &mut MetadataBuilder,
-    ) {
+    /// when validation is enabled. For a fallible version, use 
[`ValueBuilder::try_append_variant`]
+    fn append_variant(mut state: ParentState<'_>, variant: Variant<'_, '_>) {
+        let buffer = state.buffer();
         match variant {
-            Variant::Null => self.append_null(),
-            Variant::BooleanTrue => self.append_bool(true),
-            Variant::BooleanFalse => self.append_bool(false),
-            Variant::Int8(v) => self.append_int8(v),
-            Variant::Int16(v) => self.append_int16(v),
-            Variant::Int32(v) => self.append_int32(v),
-            Variant::Int64(v) => self.append_int64(v),
-            Variant::Date(v) => self.append_date(v),
-            Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
-            Variant::TimestampNtzMicros(v) => 
self.append_timestamp_ntz_micros(v),
-            Variant::Decimal4(decimal4) => self.append_decimal4(decimal4),
-            Variant::Decimal8(decimal8) => self.append_decimal8(decimal8),
-            Variant::Decimal16(decimal16) => self.append_decimal16(decimal16),
-            Variant::Float(v) => self.append_float(v),
-            Variant::Double(v) => self.append_double(v),
-            Variant::Binary(v) => self.append_binary(v),
-            Variant::String(s) => self.append_string(s),
-            Variant::ShortString(s) => self.append_short_string(s),
-            Variant::Object(obj) => self.append_object(metadata_builder, obj),
-            Variant::List(list) => self.append_list(metadata_builder, list),
-            Variant::Time(v) => self.append_time_micros(v),
+            Variant::Null => buffer.append_null(),
+            Variant::BooleanTrue => buffer.append_bool(true),
+            Variant::BooleanFalse => buffer.append_bool(false),
+            Variant::Int8(v) => buffer.append_int8(v),
+            Variant::Int16(v) => buffer.append_int16(v),
+            Variant::Int32(v) => buffer.append_int32(v),
+            Variant::Int64(v) => buffer.append_int64(v),
+            Variant::Date(v) => buffer.append_date(v),
+            Variant::Time(v) => buffer.append_time_micros(v),
+            Variant::TimestampMicros(v) => buffer.append_timestamp_micros(v),
+            Variant::TimestampNtzMicros(v) => 
buffer.append_timestamp_ntz_micros(v),
+            Variant::Decimal4(decimal4) => buffer.append_decimal4(decimal4),
+            Variant::Decimal8(decimal8) => buffer.append_decimal8(decimal8),
+            Variant::Decimal16(decimal16) => 
buffer.append_decimal16(decimal16),
+            Variant::Float(v) => buffer.append_float(v),
+            Variant::Double(v) => buffer.append_double(v),
+            Variant::Binary(v) => buffer.append_binary(v),
+            Variant::String(s) => buffer.append_string(s),
+            Variant::ShortString(s) => buffer.append_short_string(s),
+            Variant::Object(obj) => return Self::append_object(state, obj),
+            Variant::List(list) => return Self::append_list(state, list),
         }
+        state.finish();
     }
 
-    /// Appends a variant to the buffer
-    fn try_append_variant<'m, 'd>(
-        &mut self,
-        variant: Variant<'m, 'd>,
-        metadata_builder: &mut MetadataBuilder,
+    /// Tries to append a variant to the provided [`ParentState`] instance.
+    ///
+    /// The attempt fails if the variant contains duplicate field names in 
objects when validation
+    /// is enabled.
+    pub fn try_append_variant(

Review Comment:
   This is _exactly_ the same as `append_variant` above, but using fallible 
code paths instead.
   
   Not sure if there's a nice way to harmonize them?



##########
parquet-variant/src/builder.rs:
##########
@@ -437,13 +397,76 @@ impl ValueBuffer {
     }
 }
 
+pub trait MetadataBuilder: std::fmt::Debug {
+    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, 
ArrowError>;
+    fn field_name(&self, field_id: usize) -> &str;
+    fn num_field_names(&self) -> usize;
+    fn truncate_field_names(&mut self, new_size: usize);
+}

Review Comment:
   This is the actual surface area of the metadata builder, if we ignore fancy 
stuff `VariantBuilder` and `VariantArrayBuilder` do with it in their 
constructors and finalizers.



##########
parquet-variant/src/builder.rs:
##########
@@ -589,82 +602,202 @@ impl<S: AsRef<str>> Extend<S> for MetadataBuilder {
 /// parent, and we cannot "split" a mutable reference across two objects 
(parent state and the child
 /// builder that uses it). So everything has to be here. Rust layout 
optimizations should treat the
 /// variants as a union, so that accessing a `buffer` or `metadata_builder` is 
branch-free.
-enum ParentState<'a> {
+#[derive(Debug)]
+pub enum ParentState<'a> {
     Variant {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
+        finished: bool,
     },
     List {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
-        parent_value_offset_base: usize,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
         offsets: &'a mut Vec<usize>,
+        saved_offsets_size: usize,
+        finished: bool,
     },
     Object {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
         fields: &'a mut IndexMap<u32, usize>,
-        field_name: &'a str,
-        parent_value_offset_base: usize,
+        saved_fields_size: usize,

Review Comment:
   When the parent is an object builder, we eagerly record the (field id, 
starting offset) pair for the new object field to be created, rolling back the 
change if `finish` is not called.



##########
parquet-variant/src/builder.rs:
##########
@@ -589,82 +602,202 @@ impl<S: AsRef<str>> Extend<S> for MetadataBuilder {
 /// parent, and we cannot "split" a mutable reference across two objects 
(parent state and the child
 /// builder that uses it). So everything has to be here. Rust layout 
optimizations should treat the
 /// variants as a union, so that accessing a `buffer` or `metadata_builder` is 
branch-free.
-enum ParentState<'a> {
+#[derive(Debug)]
+pub enum ParentState<'a> {
     Variant {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
+        finished: bool,
     },
     List {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
-        parent_value_offset_base: usize,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
         offsets: &'a mut Vec<usize>,
+        saved_offsets_size: usize,
+        finished: bool,
     },
     Object {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
         fields: &'a mut IndexMap<u32, usize>,
-        field_name: &'a str,
-        parent_value_offset_base: usize,
+        saved_fields_size: usize,
+        finished: bool,
     },
 }
 
-impl ParentState<'_> {
-    fn buffer(&mut self) -> &mut ValueBuffer {
-        match self {
-            ParentState::Variant { buffer, .. } => buffer,
-            ParentState::List { buffer, .. } => buffer,
-            ParentState::Object { buffer, .. } => buffer,
+impl<'a> ParentState<'a> {
+    pub fn variant(
+        buffer: &'a mut ValueBuilder,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+    ) -> Self {
+        ParentState::Variant {
+            saved_buffer_offset: buffer.offset(),
+            saved_metadata_builder_dict_size: 
metadata_builder.num_field_names(),
+            buffer,
+            metadata_builder,
+            finished: false,
+        }
+    }
+
+    pub fn list(
+        buffer: &'a mut ValueBuilder,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        offsets: &'a mut Vec<usize>,
+        saved_parent_buffer_offset: usize,
+    ) -> Self {
+        // The saved_parent_buffer_offset is the buffer size as of when the 
parent builder was
+        // constructed. The saved_buffer_offset is the buffer size as of now 
(when a child builder
+        // is created). The variant field_offset entry for this list element 
is their difference.
+        let saved_buffer_offset = buffer.offset();
+        let saved_offsets_size = offsets.len();
+        offsets.push(saved_buffer_offset - saved_parent_buffer_offset);
+
+        ParentState::List {
+            saved_metadata_builder_dict_size: 
metadata_builder.num_field_names(),
+            saved_buffer_offset,
+            saved_offsets_size,
+            metadata_builder,
+            buffer,
+            offsets,
+            finished: false,
+        }
+    }
+
+    pub fn object(
+        buffer: &'a mut ValueBuilder,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        fields: &'a mut IndexMap<u32, usize>,
+        saved_parent_buffer_offset: usize,
+        field_name: &str,
+        validate_unique_fields: bool,
+    ) -> Result<Self, ArrowError> {
+        // The saved_parent_buffer_offset is the buffer size as of when the 
parent builder was
+        // constructed. The saved_buffer_offset is the buffer size as of now 
(when a child builder
+        // is created). The variant field_offset entry for this field is their 
difference.
+        let saved_buffer_offset = buffer.offset();
+        let saved_fields_size = fields.len();
+        let saved_metadata_builder_dict_size = 
metadata_builder.num_field_names();
+        let field_id = metadata_builder.try_upsert_field_name(field_name)?;
+        let field_start = saved_buffer_offset - saved_parent_buffer_offset;
+        if fields.insert(field_id, field_start).is_some() && 
validate_unique_fields {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Duplicate field: {field_name}"
+            )));
         }
+
+        Ok(ParentState::Object {
+            saved_metadata_builder_dict_size,
+            saved_buffer_offset,
+            saved_fields_size,
+            buffer,
+            metadata_builder,
+            fields,
+            finished: false,
+        })
     }
 
-    fn metadata_builder(&mut self) -> &mut MetadataBuilder {
+    fn buffer(&mut self) -> &mut ValueBuilder {
+        self.buffer_and_metadata_builder().0
+    }
+
+    fn metadata_builder(&mut self) -> &mut dyn MetadataBuilder {
+        self.buffer_and_metadata_builder().1

Review Comment:
   These used to manually `match self`



##########
parquet-variant/src/builder.rs:
##########
@@ -530,32 +545,30 @@ impl MetadataBuilder {
         metadata_buffer.push(0x01 | (is_sorted as u8) << 4 | ((offset_size - 
1) << 6));
 
         // Write dictionary size
-        write_offset(&mut metadata_buffer, nkeys, offset_size);
+        write_offset(metadata_buffer, nkeys, offset_size);
 
         // Write offsets
         let mut cur_offset = 0;
         for key in field_names.iter() {
-            write_offset(&mut metadata_buffer, cur_offset, offset_size);
+            write_offset(metadata_buffer, cur_offset, offset_size);
             cur_offset += key.len();
         }
         // Write final offset
-        write_offset(&mut metadata_buffer, cur_offset, offset_size);
+        write_offset(metadata_buffer, cur_offset, offset_size);
 
         // Write string data
         for key in field_names {
             metadata_buffer.extend_from_slice(key.as_bytes());
         }
-
-        metadata_buffer
     }
 
     /// Return the inner buffer, without finalizing any in progress metadata.
-    pub(crate) fn take_buffer(self) -> Vec<u8> {
+    pub fn into_inner(self) -> Vec<u8> {

Review Comment:
   The old name was a big weird... switched to a more traditional name



##########
parquet-variant/src/builder.rs:
##########
@@ -1068,29 +1131,20 @@ impl VariantBuilder {
         self.metadata_builder.upsert_field_name(field_name);
     }
 
-    // Returns validate_unique_fields because we can no longer reference self 
once this method returns.
-    fn parent_state(&mut self) -> (ParentState<'_>, bool) {

Review Comment:
   Thanks to the new parent state constructors, this helper was actually more 
work than it's worth



##########
parquet-variant/src/builder.rs:
##########
@@ -1299,61 +1330,26 @@ impl<'a> ListBuilder<'a> {
             .inner_mut()
             .splice(starting_offset..starting_offset, bytes_to_splice);
 
-        self.parent_state.finish(starting_offset);
-        self.has_been_finished = true;
-    }
-}
-
-/// Drop implementation for ListBuilder does nothing
-/// as the `finish` method must be called to finalize the list.
-/// This is to ensure that the list is always finalized before its parent 
builder
-/// is finalized.
-impl Drop for ListBuilder<'_> {
-    fn drop(&mut self) {
-        if !self.has_been_finished {
-            self.parent_state
-                .buffer()
-                .inner_mut()
-                .truncate(self.parent_value_offset_base);
-            self.parent_state
-                .metadata_builder()
-                .field_names
-                .truncate(self.parent_metadata_offset_base);
-        }
+        self.parent_state.finish();
     }
 }
 
 /// A builder for creating [`Variant::Object`] values.
 ///
 /// See the examples on [`VariantBuilder`] for usage.
+#[derive(Debug)]
 pub struct ObjectBuilder<'a> {
     parent_state: ParentState<'a>,
     fields: IndexMap<u32, usize>, // (field_id, offset)
-    /// The starting offset in the parent's buffer where this object starts
-    parent_value_offset_base: usize,
-    /// The starting offset in the parent's metadata buffer where this object 
starts
-    /// used to truncate the written fields in `drop` if the current object 
has not been finished
-    parent_metadata_offset_base: usize,
-    /// Whether the object has been finished, the written content of the 
current object
-    /// will be truncated in `drop` if `has_been_finished` is false
-    has_been_finished: bool,
     validate_unique_fields: bool,
-    /// Set of duplicate fields to report for errors
-    duplicate_fields: HashSet<u32>,

Review Comment:
   Not needed when we immediately report duplicates. 
   
   And removing it makes the new parent state approach a lot simpler.



##########
parquet-variant/src/builder.rs:
##########
@@ -2475,29 +2444,26 @@ mod tests {
         let mut builder = 
VariantBuilder::new().with_validate_unique_fields(true);
 
         // Root-level object with duplicates
-        let mut root_obj = builder.new_object();
-        root_obj.insert("a", 1);
-        root_obj.insert("b", 2);
-        root_obj.insert("a", 3);
-        root_obj.insert("b", 4);
-
-        let result = root_obj.finish();
+        let result = builder
+            .new_object()
+            .with_field("a", 1)
+            .with_field("b", 2)
+            .try_with_field("a", 3);

Review Comment:
   The duplicate field is now detected immediately upon insertion, not at finish



##########
parquet-variant-json/src/from_json.rs:
##########
@@ -630,10 +630,10 @@ mod test {
         // Verify metadata size = 1 + 2 + 2 * 497 + 3 * 496
         assert_eq!(metadata.len(), 2485);
         // Verify value size.
-        // Size of innermost_list: 1 + 1 + 258 + 256 = 516
-        // Size of inner object: 1 + 4 + 256 + 257 * 3 + 256 * 516 = 133128
-        // Size of json: 1 + 4 + 512 + 1028 + 256 * 133128 = 34082313
-        assert_eq!(value.len(), 34082313);
+        // Size of innermost_list: 1 + 1 + 2*(128 + 1) + 2*128 = 516
+        // Size of inner object: 1 + 4 + 2*256 + 3*(256 + 1) + 256 * 516 = 
133384
+        // Size of json: 1 + 4 + 2*256 + 4*(256 + 1) + 256 * 133384 = 34147849
+        assert_eq!(value.len(), 34147849);

Review Comment:
   When I changed the `ParentState` protocol to eagerly insert new field names, 
it upset the delicate balance in this test:
   * Previously, it would start the outer object, then register the first inner 
object without registering its field name, and then register 256 fields; that 
meant the dictionary's first 256 entries were all contained inside the first 
inner object, and could be encoded as a single byte.
   * Now, the first inner object's field name is registered immediately upon 
creation, which means the dictionary contains 257 entries by the time the inner 
object gets finalized. Which requires 2 bytes to encode each field id.
   
   The size change is tiny -- less than 0.2% -- and the original optimized 
layout requires an extremely precise choreography that is unlikely to arise 
often in the wild. So IMO the change is acceptable.



##########
parquet-variant-compute/src/variant_array_builder.rs:
##########
@@ -186,10 +185,7 @@ impl VariantArrayBuilder {
     /// assert!(variant_array.value(1).as_object().is_some());
     ///  ```
     pub fn variant_builder(&mut self) -> VariantArrayVariantBuilder<'_> {
-        // append directly into the metadata and value buffers
-        let metadata_buffer = std::mem::take(&mut self.metadata_buffer);
-        let value_buffer = std::mem::take(&mut self.value_buffer);
-        VariantArrayVariantBuilder::new(self, metadata_buffer, value_buffer)
+        VariantArrayVariantBuilder::new(self)

Review Comment:
   Move all the magic into the constructor (this could be done as a separate PR)



##########
parquet-variant-compute/src/variant_array_builder.rs:
##########
@@ -212,22 +208,19 @@ pub struct VariantArrayVariantBuilder<'a> {
     /// have been moved into the variant builder, and must be returned on
     /// drop
     array_builder: &'a mut VariantArrayBuilder,
-    /// Builder for the in progress variant value, temporarily owns the buffers
-    /// from `array_builder`
-    variant_builder: VariantBuilder,
 }
 
 impl VariantBuilderExt for VariantArrayVariantBuilder<'_> {
     fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
-        self.variant_builder.append_value(value);
+        ValueBuilder::try_append_variant(self.parent_state(), 
value.into()).unwrap()

Review Comment:
   Missed one
   ```suggestion
           ValueBuilder::append_variant(self.parent_state(), value.into())
   ```



##########
parquet-variant/src/builder.rs:
##########
@@ -885,40 +1000,6 @@ impl ParentState<'_> {
 /// );
 ///
 /// ```
-/// # Example: Reusing Buffers

Review Comment:
   This API was only needed by the `VariantArrayBuilder`, which now uses a very 
different approach.



##########
parquet-variant-compute/src/variant_array_builder.rs:
##########
@@ -272,84 +257,46 @@ impl<'a> VariantArrayVariantBuilder<'a> {
 
         let metadata_offset = self.metadata_offset;
         let value_offset = self.value_offset;
-        // get the buffers back from the variant builder
-        let (metadata_buffer, value_buffer) = std::mem::take(&mut 
self.variant_builder).finish();
+
+        let metadata_builder = &mut self.array_builder.metadata_builder;
+        let value_builder = &mut self.array_builder.value_builder;
 
         // Sanity Check: if the buffers got smaller, something went wrong 
(previous data was lost)
-        let metadata_len = metadata_buffer
-            .len()
-            .checked_sub(metadata_offset)
-            .expect("metadata length decreased unexpectedly");
-        let value_len = value_buffer
-            .len()
-            .checked_sub(value_offset)
-            .expect("value length decreased unexpectedly");
+        assert!(
+            metadata_offset <= metadata_builder.offset(),
+            "metadata length decreased unexpectedly"
+        );
+        assert!(
+            value_offset <= value_builder.offset(),
+            "value length decreased unexpectedly"
+        );
+
+        metadata_builder.finish();
 
         // commit the changes by putting the
         // offsets and lengths into the parent array builder.
-        self.array_builder
-            .metadata_locations
-            .push((metadata_offset, metadata_len));
-        self.array_builder
-            .value_locations
-            .push((value_offset, value_len));
+        self.array_builder.metadata_offsets.push(metadata_offset);
+        self.array_builder.value_offsets.push(value_offset);
         self.array_builder.nulls.append_non_null();
-        // put the buffers back into the array builder
-        self.array_builder.metadata_buffer = metadata_buffer;
-        self.array_builder.value_buffer = value_buffer;
     }
 }
 
+// Make it harder for people to accidentally forget to `finish` the builder.
 impl Drop for VariantArrayVariantBuilder<'_> {
-    /// If the builder was not finished, roll back any changes made to the
-    /// underlying buffers (by truncating them)
-    fn drop(&mut self) {
-        if self.finished {
-            return;
-        }
-
-        // if the object was not finished, need to rollback any changes by
-        // truncating the buffers to the original offsets
-        let metadata_offset = self.metadata_offset;
-        let value_offset = self.value_offset;
-
-        // get the buffers back from the variant builder
-        let (mut metadata_buffer, mut value_buffer) =
-            std::mem::take(&mut self.variant_builder).into_buffers();
-
-        // Sanity Check: if the buffers got smaller, something went wrong 
(previous data was lost) so panic immediately
-        metadata_buffer
-            .len()
-            .checked_sub(metadata_offset)
-            .expect("metadata length decreased unexpectedly");
-        value_buffer
-            .len()
-            .checked_sub(value_offset)
-            .expect("value length decreased unexpectedly");
-
-        // Note this truncate is fast because truncate doesn't free any memory:
-        // it just has to drop elements (and u8 doesn't have a destructor)
-        metadata_buffer.truncate(metadata_offset);
-        value_buffer.truncate(value_offset);
-
-        // put the buffers back into the array builder
-        self.array_builder.metadata_buffer = metadata_buffer;
-        self.array_builder.value_buffer = value_buffer;

Review Comment:
   The new `ParentState` approach eliminated all this code.



##########
parquet-variant-compute/src/variant_array_builder.rs:
##########
@@ -272,84 +257,46 @@ impl<'a> VariantArrayVariantBuilder<'a> {
 
         let metadata_offset = self.metadata_offset;
         let value_offset = self.value_offset;
-        // get the buffers back from the variant builder
-        let (metadata_buffer, value_buffer) = std::mem::take(&mut 
self.variant_builder).finish();
+
+        let metadata_builder = &mut self.array_builder.metadata_builder;
+        let value_builder = &mut self.array_builder.value_builder;
 
         // Sanity Check: if the buffers got smaller, something went wrong 
(previous data was lost)
-        let metadata_len = metadata_buffer
-            .len()
-            .checked_sub(metadata_offset)
-            .expect("metadata length decreased unexpectedly");
-        let value_len = value_buffer
-            .len()
-            .checked_sub(value_offset)
-            .expect("value length decreased unexpectedly");
+        assert!(
+            metadata_offset <= metadata_builder.offset(),
+            "metadata length decreased unexpectedly"
+        );
+        assert!(
+            value_offset <= value_builder.offset(),
+            "value length decreased unexpectedly"
+        );
+
+        metadata_builder.finish();
 
         // commit the changes by putting the
         // offsets and lengths into the parent array builder.
-        self.array_builder
-            .metadata_locations
-            .push((metadata_offset, metadata_len));
-        self.array_builder
-            .value_locations
-            .push((value_offset, value_len));
+        self.array_builder.metadata_offsets.push(metadata_offset);
+        self.array_builder.value_offsets.push(value_offset);
         self.array_builder.nulls.append_non_null();
-        // put the buffers back into the array builder
-        self.array_builder.metadata_buffer = metadata_buffer;
-        self.array_builder.value_buffer = value_buffer;
     }
 }
 
+// Make it harder for people to accidentally forget to `finish` the builder.
 impl Drop for VariantArrayVariantBuilder<'_> {
-    /// If the builder was not finished, roll back any changes made to the
-    /// underlying buffers (by truncating them)
-    fn drop(&mut self) {
-        if self.finished {
-            return;
-        }
-
-        // if the object was not finished, need to rollback any changes by
-        // truncating the buffers to the original offsets
-        let metadata_offset = self.metadata_offset;
-        let value_offset = self.value_offset;
-
-        // get the buffers back from the variant builder
-        let (mut metadata_buffer, mut value_buffer) =
-            std::mem::take(&mut self.variant_builder).into_buffers();
-
-        // Sanity Check: if the buffers got smaller, something went wrong 
(previous data was lost) so panic immediately
-        metadata_buffer
-            .len()
-            .checked_sub(metadata_offset)
-            .expect("metadata length decreased unexpectedly");
-        value_buffer
-            .len()
-            .checked_sub(value_offset)
-            .expect("value length decreased unexpectedly");
-
-        // Note this truncate is fast because truncate doesn't free any memory:
-        // it just has to drop elements (and u8 doesn't have a destructor)
-        metadata_buffer.truncate(metadata_offset);
-        value_buffer.truncate(value_offset);
-
-        // put the buffers back into the array builder
-        self.array_builder.metadata_buffer = metadata_buffer;
-        self.array_builder.value_buffer = value_buffer;
-    }
+    fn drop(&mut self) {}
 }
 
-fn binary_view_array_from_buffers(
-    buffer: Vec<u8>,
-    locations: Vec<(usize, usize)>,
-) -> BinaryViewArray {
-    let mut builder = BinaryViewBuilder::with_capacity(locations.len());
+fn binary_view_array_from_buffers(buffer: Vec<u8>, mut offsets: Vec<usize>) -> 
BinaryViewArray {
+    let mut builder = BinaryViewBuilder::with_capacity(offsets.len());
+    offsets.push(buffer.len());

Review Comment:
   ```suggestion
       // Push the final end offset only to simplify the loop below
       offsets.push(buffer.len());
   ```



##########
parquet-variant/src/builder.rs:
##########
@@ -285,97 +264,78 @@ impl ValueBuffer {
         Ok(())
     }
 
-    fn offset(&self) -> usize {
+    pub fn offset(&self) -> usize {
         self.0.len()
     }
 
-    fn new_object<'a>(
-        &'a mut self,
-        metadata_builder: &'a mut MetadataBuilder,
-    ) -> ObjectBuilder<'a> {
-        let parent_state = ParentState::Variant {
-            buffer: self,
-            metadata_builder,
-        };
-        let validate_unique_fields = false;
-        ObjectBuilder::new(parent_state, validate_unique_fields)
-    }
-
-    fn new_list<'a>(&'a mut self, metadata_builder: &'a mut MetadataBuilder) 
-> ListBuilder<'a> {
-        let parent_state = ParentState::Variant {
-            buffer: self,
-            metadata_builder,
-        };
-        let validate_unique_fields = false;
-        ListBuilder::new(parent_state, validate_unique_fields)
-    }

Review Comment:
   Replaced by the new `ParentState` passing API



##########
parquet-variant/src/builder.rs:
##########
@@ -234,8 +223,8 @@ impl ValueBuffer {
         self.append_slice(value.as_bytes());
     }
 
-    fn append_object(&mut self, metadata_builder: &mut MetadataBuilder, obj: 
VariantObject) {
-        let mut object_builder = self.new_object(metadata_builder);
+    fn append_object(state: ParentState<'_>, obj: VariantObject) {
+        let mut object_builder = ObjectBuilder::new(state, false);

Review Comment:
   By taking a `ParentState` created by our caller, we allow our caller to 
create the correct flavor (variant, list, object). It also reduces the 
complexity of tracking and managing the value and metadata builders separately.



##########
parquet-variant/src/builder.rs:
##########
@@ -589,82 +602,202 @@ impl<S: AsRef<str>> Extend<S> for MetadataBuilder {
 /// parent, and we cannot "split" a mutable reference across two objects 
(parent state and the child
 /// builder that uses it). So everything has to be here. Rust layout 
optimizations should treat the
 /// variants as a union, so that accessing a `buffer` or `metadata_builder` is 
branch-free.
-enum ParentState<'a> {
+#[derive(Debug)]
+pub enum ParentState<'a> {
     Variant {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
+        finished: bool,
     },
     List {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
-        parent_value_offset_base: usize,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
         offsets: &'a mut Vec<usize>,
+        saved_offsets_size: usize,

Review Comment:
   When the parent is a list builder, we eagerly append a starting offset for 
the new array element to be created, rolling back the change if `finish` is not 
called.



##########
parquet-variant/src/builder.rs:
##########
@@ -285,97 +264,78 @@ impl ValueBuffer {
         Ok(())
     }
 
-    fn offset(&self) -> usize {
+    pub fn offset(&self) -> usize {
         self.0.len()
     }
 
-    fn new_object<'a>(
-        &'a mut self,
-        metadata_builder: &'a mut MetadataBuilder,
-    ) -> ObjectBuilder<'a> {
-        let parent_state = ParentState::Variant {
-            buffer: self,
-            metadata_builder,
-        };
-        let validate_unique_fields = false;
-        ObjectBuilder::new(parent_state, validate_unique_fields)
-    }
-
-    fn new_list<'a>(&'a mut self, metadata_builder: &'a mut MetadataBuilder) 
-> ListBuilder<'a> {
-        let parent_state = ParentState::Variant {
-            buffer: self,
-            metadata_builder,
-        };
-        let validate_unique_fields = false;
-        ListBuilder::new(parent_state, validate_unique_fields)
-    }
-
-    /// Appends a variant to the buffer.
+    /// Appends a variant to the provided [`ParentState`] instance.
     ///
     /// # Panics
     ///
     /// This method will panic if the variant contains duplicate field names 
in objects
-    /// when validation is enabled. For a fallible version, use 
[`ValueBuffer::try_append_variant`]
-    fn append_variant<'m, 'd>(
-        &mut self,
-        variant: Variant<'m, 'd>,
-        metadata_builder: &mut MetadataBuilder,
-    ) {
+    /// when validation is enabled. For a fallible version, use 
[`ValueBuilder::try_append_variant`]
+    fn append_variant(mut state: ParentState<'_>, variant: Variant<'_, '_>) {
+        let buffer = state.buffer();
         match variant {
-            Variant::Null => self.append_null(),
-            Variant::BooleanTrue => self.append_bool(true),
-            Variant::BooleanFalse => self.append_bool(false),
-            Variant::Int8(v) => self.append_int8(v),
-            Variant::Int16(v) => self.append_int16(v),
-            Variant::Int32(v) => self.append_int32(v),
-            Variant::Int64(v) => self.append_int64(v),
-            Variant::Date(v) => self.append_date(v),
-            Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
-            Variant::TimestampNtzMicros(v) => 
self.append_timestamp_ntz_micros(v),
-            Variant::Decimal4(decimal4) => self.append_decimal4(decimal4),
-            Variant::Decimal8(decimal8) => self.append_decimal8(decimal8),
-            Variant::Decimal16(decimal16) => self.append_decimal16(decimal16),
-            Variant::Float(v) => self.append_float(v),
-            Variant::Double(v) => self.append_double(v),
-            Variant::Binary(v) => self.append_binary(v),
-            Variant::String(s) => self.append_string(s),
-            Variant::ShortString(s) => self.append_short_string(s),
-            Variant::Object(obj) => self.append_object(metadata_builder, obj),
-            Variant::List(list) => self.append_list(metadata_builder, list),
-            Variant::Time(v) => self.append_time_micros(v),

Review Comment:
   This `Time` match arm just moved up to `Date` where it arguably belongs



##########
parquet-variant/src/builder.rs:
##########
@@ -504,17 +517,19 @@ impl MetadataBuilder {
         self.field_names.iter().map(|k| k.len()).sum()
     }
 
-    fn finish(self) -> Vec<u8> {
+    pub fn offset(&self) -> usize {
+        self.metadata_buffer.len()
+    }
+
+    pub fn finish(&mut self) {
         let nkeys = self.num_field_names();
 
         // Calculate metadata size
         let total_dict_size: usize = self.metadata_size();
 
-        let Self {
-            field_names,
-            is_sorted,
-            mut metadata_buffer,
-        } = self;
+        let field_names = std::mem::take(&mut self.field_names);
+        let is_sorted = std::mem::take(&mut self.is_sorted);
+        let metadata_buffer = &mut self.metadata_buffer;

Review Comment:
   MetadataBuilderXX is now reusable -- calling `finish` just appends the 
resulting variant metadata bytes to the underlying buffer and resets the field 
name tracking for the next one. Very useful for the `VariantArrayBuilder` use 
case.



##########
parquet-variant/src/builder.rs:
##########
@@ -285,97 +264,78 @@ impl ValueBuffer {
         Ok(())
     }
 
-    fn offset(&self) -> usize {
+    pub fn offset(&self) -> usize {
         self.0.len()
     }
 
-    fn new_object<'a>(
-        &'a mut self,
-        metadata_builder: &'a mut MetadataBuilder,
-    ) -> ObjectBuilder<'a> {
-        let parent_state = ParentState::Variant {
-            buffer: self,
-            metadata_builder,
-        };
-        let validate_unique_fields = false;
-        ObjectBuilder::new(parent_state, validate_unique_fields)
-    }
-
-    fn new_list<'a>(&'a mut self, metadata_builder: &'a mut MetadataBuilder) 
-> ListBuilder<'a> {
-        let parent_state = ParentState::Variant {
-            buffer: self,
-            metadata_builder,
-        };
-        let validate_unique_fields = false;
-        ListBuilder::new(parent_state, validate_unique_fields)
-    }
-
-    /// Appends a variant to the buffer.
+    /// Appends a variant to the provided [`ParentState`] instance.
     ///
     /// # Panics
     ///
     /// This method will panic if the variant contains duplicate field names 
in objects
-    /// when validation is enabled. For a fallible version, use 
[`ValueBuffer::try_append_variant`]
-    fn append_variant<'m, 'd>(
-        &mut self,
-        variant: Variant<'m, 'd>,
-        metadata_builder: &mut MetadataBuilder,
-    ) {
+    /// when validation is enabled. For a fallible version, use 
[`ValueBuilder::try_append_variant`]
+    fn append_variant(mut state: ParentState<'_>, variant: Variant<'_, '_>) {
+        let buffer = state.buffer();
         match variant {
-            Variant::Null => self.append_null(),
-            Variant::BooleanTrue => self.append_bool(true),
-            Variant::BooleanFalse => self.append_bool(false),
-            Variant::Int8(v) => self.append_int8(v),
-            Variant::Int16(v) => self.append_int16(v),
-            Variant::Int32(v) => self.append_int32(v),
-            Variant::Int64(v) => self.append_int64(v),
-            Variant::Date(v) => self.append_date(v),
-            Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
-            Variant::TimestampNtzMicros(v) => 
self.append_timestamp_ntz_micros(v),
-            Variant::Decimal4(decimal4) => self.append_decimal4(decimal4),
-            Variant::Decimal8(decimal8) => self.append_decimal8(decimal8),
-            Variant::Decimal16(decimal16) => self.append_decimal16(decimal16),
-            Variant::Float(v) => self.append_float(v),
-            Variant::Double(v) => self.append_double(v),
-            Variant::Binary(v) => self.append_binary(v),
-            Variant::String(s) => self.append_string(s),
-            Variant::ShortString(s) => self.append_short_string(s),
-            Variant::Object(obj) => self.append_object(metadata_builder, obj),
-            Variant::List(list) => self.append_list(metadata_builder, list),
-            Variant::Time(v) => self.append_time_micros(v),
+            Variant::Null => buffer.append_null(),
+            Variant::BooleanTrue => buffer.append_bool(true),
+            Variant::BooleanFalse => buffer.append_bool(false),
+            Variant::Int8(v) => buffer.append_int8(v),
+            Variant::Int16(v) => buffer.append_int16(v),
+            Variant::Int32(v) => buffer.append_int32(v),
+            Variant::Int64(v) => buffer.append_int64(v),
+            Variant::Date(v) => buffer.append_date(v),
+            Variant::Time(v) => buffer.append_time_micros(v),
+            Variant::TimestampMicros(v) => buffer.append_timestamp_micros(v),
+            Variant::TimestampNtzMicros(v) => 
buffer.append_timestamp_ntz_micros(v),
+            Variant::Decimal4(decimal4) => buffer.append_decimal4(decimal4),
+            Variant::Decimal8(decimal8) => buffer.append_decimal8(decimal8),
+            Variant::Decimal16(decimal16) => 
buffer.append_decimal16(decimal16),
+            Variant::Float(v) => buffer.append_float(v),
+            Variant::Double(v) => buffer.append_double(v),
+            Variant::Binary(v) => buffer.append_binary(v),
+            Variant::String(s) => buffer.append_string(s),
+            Variant::ShortString(s) => buffer.append_short_string(s),
+            Variant::Object(obj) => return Self::append_object(state, obj),
+            Variant::List(list) => return Self::append_list(state, list),
         }
+        state.finish();

Review Comment:
   The object and list builders consume `state` and return early; everyone else 
has to `finish` it manually here.



##########
parquet-variant/src/builder.rs:
##########
@@ -437,13 +397,76 @@ impl ValueBuffer {
     }
 }
 
+pub trait MetadataBuilder: std::fmt::Debug {
+    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, 
ArrowError>;
+    fn field_name(&self, field_id: usize) -> &str;
+    fn num_field_names(&self) -> usize;
+    fn truncate_field_names(&mut self, new_size: usize);
+}
+
+impl MetadataBuilder for MetadataBuilderXX {
+    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, 
ArrowError> {
+        Ok(self.upsert_field_name(field_name))
+    }
+    fn field_name(&self, field_id: usize) -> &str {
+        self.field_name(field_id)
+    }
+    fn num_field_names(&self) -> usize {
+        self.num_field_names()
+    }
+    fn truncate_field_names(&mut self, new_size: usize) {
+        self.field_names.truncate(new_size)
+    }
+}
+
+#[derive(Debug)]
+pub struct ReadOnlyMetadataBuilder<'m> {
+    metadata: VariantMetadata<'m>,
+    known_field_names: HashMap<&'m str, usize>,
+}
+
+impl<'m> ReadOnlyMetadataBuilder<'m> {
+    pub fn new(metadata: VariantMetadata<'m>) -> Self {
+        Self {
+            metadata,
+            known_field_names: HashMap::new(),
+        }
+    }
+}
+
+impl MetadataBuilder for ReadOnlyMetadataBuilder<'_> {
+    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, 
ArrowError> {
+        if let Some(field_id) = self.known_field_names.get(field_name) {
+            return Ok(*field_id as u32);
+        }
+
+        // TODO: Be (a lot) smarter here!
+        let Some(field_id) = self.metadata.iter().position(|name| name == 
field_name) else {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Field name '{field_name}' not found in metadata",
+            )));
+        };
+        self.known_field_names.insert(self.metadata.get_infallible(field_id), 
field_id);
+        Ok(field_id as u32)
+    }
+    fn field_name(&self, field_id: usize) -> &str {
+        &self.metadata[field_id]
+    }
+    fn num_field_names(&self) -> usize {
+        self.metadata.len()
+    }
+    fn truncate_field_names(&mut self, new_size: usize) {
+        debug_assert_eq!(self.metadata.len(), new_size);
+    }
+}
+
 /// Builder for constructing metadata for [`Variant`] values.
 ///
 /// This is used internally by the [`VariantBuilder`] to construct the metadata
 ///
 /// You can use an existing `Vec<u8>` as the metadata buffer by using the 
`from` impl.
 #[derive(Default, Debug)]
-struct MetadataBuilder {
+pub struct MetadataBuilderXX {

Review Comment:
   Yeah, this needs a real name. Ideas welcome!



##########
parquet-variant/src/builder.rs:
##########
@@ -437,13 +397,76 @@ impl ValueBuffer {
     }
 }
 
+pub trait MetadataBuilder: std::fmt::Debug {
+    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, 
ArrowError>;
+    fn field_name(&self, field_id: usize) -> &str;
+    fn num_field_names(&self) -> usize;
+    fn truncate_field_names(&mut self, new_size: usize);
+}
+
+impl MetadataBuilder for MetadataBuilderXX {
+    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, 
ArrowError> {
+        Ok(self.upsert_field_name(field_name))
+    }
+    fn field_name(&self, field_id: usize) -> &str {
+        self.field_name(field_id)
+    }
+    fn num_field_names(&self) -> usize {
+        self.num_field_names()
+    }
+    fn truncate_field_names(&mut self, new_size: usize) {
+        self.field_names.truncate(new_size)
+    }
+}
+
+#[derive(Debug)]
+pub struct ReadOnlyMetadataBuilder<'m> {
+    metadata: VariantMetadata<'m>,
+    known_field_names: HashMap<&'m str, usize>,
+}
+
+impl<'m> ReadOnlyMetadataBuilder<'m> {
+    pub fn new(metadata: VariantMetadata<'m>) -> Self {
+        Self {
+            metadata,
+            known_field_names: HashMap::new(),
+        }
+    }
+}
+
+impl MetadataBuilder for ReadOnlyMetadataBuilder<'_> {
+    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, 
ArrowError> {
+        if let Some(field_id) = self.known_field_names.get(field_name) {
+            return Ok(*field_id as u32);
+        }
+
+        // TODO: Be (a lot) smarter here!

Review Comment:
   See https://github.com/apache/arrow-rs/pull/7915/, just didn't copy it over



##########
parquet-variant/src/builder.rs:
##########
@@ -589,82 +602,202 @@ impl<S: AsRef<str>> Extend<S> for MetadataBuilder {
 /// parent, and we cannot "split" a mutable reference across two objects 
(parent state and the child
 /// builder that uses it). So everything has to be here. Rust layout 
optimizations should treat the
 /// variants as a union, so that accessing a `buffer` or `metadata_builder` is 
branch-free.
-enum ParentState<'a> {
+#[derive(Debug)]
+pub enum ParentState<'a> {
     Variant {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,

Review Comment:
   All three sub-types reset the value and metadata builders back to their 
original sizes if `finish` is not called.



##########
parquet-variant/src/builder.rs:
##########
@@ -1149,27 +1200,14 @@ impl VariantBuilder {
 pub struct ListBuilder<'a> {
     parent_state: ParentState<'a>,
     offsets: Vec<usize>,
-    /// The starting offset in the parent's buffer where this list starts
-    parent_value_offset_base: usize,
-    /// The starting offset in the parent's metadata buffer where this list 
starts
-    /// used to truncate the written fields in `drop` if the current list has 
not been finished
-    parent_metadata_offset_base: usize,
-    /// Whether the list has been finished, the written content of the current 
list
-    /// will be truncated in `drop` if `has_been_finished` is false
-    has_been_finished: bool,

Review Comment:
   Tracked by the parent state now.



##########
parquet-variant/src/builder.rs:
##########
@@ -1227,14 +1265,8 @@ impl<'a> ListBuilder<'a> {
         &mut self,
         value: T,
     ) -> Result<(), ArrowError> {
-        let (buffer, metadata_builder) = 
self.parent_state.buffer_and_metadata_builder();
-
-        let offset = buffer.offset() - self.parent_value_offset_base;
-        self.offsets.push(offset);
-
-        buffer.try_append_variant(value.into(), metadata_builder)?;
-
-        Ok(())
+        let (state, _) = self.parent_state();
+        ValueBuilder::try_append_variant(state, value.into())

Review Comment:
   Again, this used to not fully roll back on failure; now it does.



##########
parquet-variant/src/builder.rs:
##########
@@ -978,38 +1057,23 @@ impl ParentState<'_> {
 /// ```
 #[derive(Default, Debug)]
 pub struct VariantBuilder {
-    buffer: ValueBuffer,
-    metadata_builder: MetadataBuilder,
+    value_builder: ValueBuilder,
+    metadata_builder: MetadataBuilderXX,
     validate_unique_fields: bool,
 }
 
 impl VariantBuilder {
     /// Create a new VariantBuilder with new underlying buffer
     pub fn new() -> Self {
-        Self {
-            buffer: ValueBuffer::new(),
-            metadata_builder: MetadataBuilder::default(),
-            validate_unique_fields: false,
-        }
+        Self::default()
     }
 
     /// Create a new VariantBuilder with pre-existing [`VariantMetadata`].
     pub fn with_metadata(mut self, metadata: VariantMetadata) -> Self {
         self.metadata_builder.extend(metadata.iter());
-
         self
     }
 
-    /// Create a new VariantBuilder that will write the metadata and values to
-    /// the specified buffers.
-    pub fn new_with_buffers(metadata_buffer: Vec<u8>, value_buffer: Vec<u8>) 
-> Self {
-        Self {
-            buffer: ValueBuffer::from(value_buffer),
-            metadata_builder: MetadataBuilder::from(metadata_buffer),
-            validate_unique_fields: false,
-        }
-    }

Review Comment:
   More supporting code for the old `VariantArrayBuilder` approach



##########
parquet-variant/src/builder.rs:
##########
@@ -1575,22 +1536,30 @@ impl Drop for ObjectBuilder<'_> {
 pub trait VariantBuilderExt {
     fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>);
 
-    fn new_list(&mut self) -> ListBuilder<'_>;
+    fn new_list(&mut self) -> ListBuilder<'_> {
+        self.try_new_list().unwrap()
+    }
+
+    fn new_object(&mut self) -> ObjectBuilder<'_> {
+        self.try_new_object().unwrap()
+    }
 
-    fn new_object(&mut self) -> ObjectBuilder<'_>;
+    fn try_new_list(&mut self) -> Result<ListBuilder<'_>, ArrowError>;
+
+    fn try_new_object(&mut self) -> Result<ObjectBuilder<'_>, ArrowError>;

Review Comment:
   At first I just changed the `new_XX` methods to return `Result`, but that 
churned a lot of unit test code. 
   
   I'm still on the fence whether this is better in the long run tho?



##########
parquet-variant/src/builder.rs:
##########
@@ -1108,26 +1162,23 @@ impl VariantBuilder {
     /// builder.append_value(42i8);
     /// ```
     pub fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) 
{
-        let variant = value.into();
-        self.buffer
-            .append_variant(variant, &mut self.metadata_builder);
+        let state = ParentState::variant(&mut self.value_builder, &mut 
self.metadata_builder);
+        ValueBuilder::append_variant(state, value.into())
     }
 
     /// Append a value to the builder.
     pub fn try_append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(
         &mut self,
         value: T,
     ) -> Result<(), ArrowError> {
-        let variant = value.into();
-        self.buffer
-            .try_append_variant(variant, &mut self.metadata_builder)?;
-
-        Ok(())
+        let state = ParentState::variant(&mut self.value_builder, &mut 
self.metadata_builder);
+        ValueBuilder::try_append_variant(state, value.into())

Review Comment:
   This used to not roll back fully if the insert failed. Now it does.



##########
parquet-variant/src/builder.rs:
##########
@@ -1299,61 +1330,26 @@ impl<'a> ListBuilder<'a> {
             .inner_mut()
             .splice(starting_offset..starting_offset, bytes_to_splice);
 
-        self.parent_state.finish(starting_offset);
-        self.has_been_finished = true;
-    }
-}
-
-/// Drop implementation for ListBuilder does nothing
-/// as the `finish` method must be called to finalize the list.
-/// This is to ensure that the list is always finalized before its parent 
builder
-/// is finalized.
-impl Drop for ListBuilder<'_> {
-    fn drop(&mut self) {

Review Comment:
   No longer needed, `ParentState` drop glue does everything.



##########
parquet-variant/src/builder.rs:
##########
@@ -1385,17 +1381,8 @@ impl<'a> ObjectBuilder<'a> {
         key: &str,
         value: T,
     ) -> Result<(), ArrowError> {
-        let (buffer, metadata_builder) = 
self.parent_state.buffer_and_metadata_builder();
-
-        let field_id = metadata_builder.upsert_field_name(key);
-        let field_start = buffer.offset() - self.parent_value_offset_base;
-
-        if self.fields.insert(field_id, field_start).is_some() && 
self.validate_unique_fields {
-            self.duplicate_fields.insert(field_id);
-        }
-
-        buffer.try_append_variant(value.into(), metadata_builder)?;
-        Ok(())
+        let (state, _) = self.parent_state(key)?;
+        ValueBuilder::try_append_variant(state, value.into())

Review Comment:
   Again, this used to not roll back fully on failure.



##########
parquet-variant/src/builder.rs:
##########
@@ -2475,29 +2444,26 @@ mod tests {
         let mut builder = 
VariantBuilder::new().with_validate_unique_fields(true);
 
         // Root-level object with duplicates
-        let mut root_obj = builder.new_object();
-        root_obj.insert("a", 1);
-        root_obj.insert("b", 2);
-        root_obj.insert("a", 3);
-        root_obj.insert("b", 4);
-
-        let result = root_obj.finish();
+        let result = builder
+            .new_object()
+            .with_field("a", 1)
+            .with_field("b", 2)
+            .try_with_field("a", 3);
         assert_eq!(
             result.unwrap_err().to_string(),
-            "Invalid argument error: Duplicate field keys detected: [a, b]"
+            "Invalid argument error: Duplicate field: a"
         );
 
         // Deeply nested list -> list -> object with duplicate
         let mut outer_list = builder.new_list();
         let mut inner_list = outer_list.new_list();
-        let mut nested_obj = inner_list.new_object();
-        nested_obj.insert("x", 1);
-        nested_obj.insert("x", 2);
-
-        let nested_result = nested_obj.finish();
+        let nested_result = inner_list
+            .new_object()
+            .with_field("x", 1)
+            .try_with_field("x", 2);

Review Comment:
   Again, duplicates are now detected immediately.



##########
parquet-variant/src/builder.rs:
##########
@@ -2716,95 +2682,20 @@ mod tests {
     #[test]
     fn test_metadata_builder_from_iter_with_string_types() {
         // &str
-        let metadata = MetadataBuilder::from_iter(["a", "b", "c"]);
+        let metadata = MetadataBuilderXX::from_iter(["a", "b", "c"]);
         assert_eq!(metadata.num_field_names(), 3);
 
         // string
         let metadata =
-            MetadataBuilder::from_iter(vec!["a".to_string(), "b".to_string(), 
"c".to_string()]);
+            MetadataBuilderXX::from_iter(vec!["a".to_string(), 
"b".to_string(), "c".to_string()]);
         assert_eq!(metadata.num_field_names(), 3);
 
         // mixed types (anything that implements AsRef<str>)
         let field_names: Vec<Box<str>> = vec!["a".into(), "b".into(), 
"c".into()];
-        let metadata = MetadataBuilder::from_iter(field_names);
+        let metadata = MetadataBuilderXX::from_iter(field_names);
         assert_eq!(metadata.num_field_names(), 3);
     }
 
-    /// Test reusing buffers with nested objects
-    #[test]
-    fn test_with_existing_buffers_nested() {

Review Comment:
   This was testing support code for `VariantArrayBuilder` that has been 
deleted.



##########
parquet-variant/src/builder.rs:
##########
@@ -1299,61 +1330,26 @@ impl<'a> ListBuilder<'a> {
             .inner_mut()
             .splice(starting_offset..starting_offset, bytes_to_splice);
 
-        self.parent_state.finish(starting_offset);
-        self.has_been_finished = true;
-    }
-}
-
-/// Drop implementation for ListBuilder does nothing
-/// as the `finish` method must be called to finalize the list.
-/// This is to ensure that the list is always finalized before its parent 
builder
-/// is finalized.
-impl Drop for ListBuilder<'_> {
-    fn drop(&mut self) {
-        if !self.has_been_finished {
-            self.parent_state
-                .buffer()
-                .inner_mut()
-                .truncate(self.parent_value_offset_base);
-            self.parent_state
-                .metadata_builder()
-                .field_names
-                .truncate(self.parent_metadata_offset_base);
-        }
+        self.parent_state.finish();
     }
 }
 
 /// A builder for creating [`Variant::Object`] values.
 ///
 /// See the examples on [`VariantBuilder`] for usage.
+#[derive(Debug)]
 pub struct ObjectBuilder<'a> {
     parent_state: ParentState<'a>,
     fields: IndexMap<u32, usize>, // (field_id, offset)
-    /// The starting offset in the parent's buffer where this object starts
-    parent_value_offset_base: usize,
-    /// The starting offset in the parent's metadata buffer where this object 
starts
-    /// used to truncate the written fields in `drop` if the current object 
has not been finished
-    parent_metadata_offset_base: usize,
-    /// Whether the object has been finished, the written content of the 
current object
-    /// will be truncated in `drop` if `has_been_finished` is false
-    has_been_finished: bool,

Review Comment:
   Tracked by `ParentState` now.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to