klion26 commented on code in PR #8167:
URL: https://github.com/apache/arrow-rs/pull/8167#discussion_r2287553855


##########
parquet-variant/src/builder.rs:
##########
@@ -437,13 +397,76 @@ impl ValueBuffer {
     }
 }
 
+pub trait MetadataBuilder: std::fmt::Debug {
+    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, 
ArrowError>;
+    fn field_name(&self, field_id: usize) -> &str;
+    fn num_field_names(&self) -> usize;
+    fn truncate_field_names(&mut self, new_size: usize);
+}
+
+impl MetadataBuilder for MetadataBuilderXX {
+    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, 
ArrowError> {
+        Ok(self.upsert_field_name(field_name))
+    }
+    fn field_name(&self, field_id: usize) -> &str {
+        self.field_name(field_id)
+    }
+    fn num_field_names(&self) -> usize {
+        self.num_field_names()
+    }
+    fn truncate_field_names(&mut self, new_size: usize) {
+        self.field_names.truncate(new_size)
+    }
+}
+
+#[derive(Debug)]
+pub struct ReadOnlyMetadataBuilder<'m> {
+    metadata: VariantMetadata<'m>,
+    known_field_names: HashMap<&'m str, usize>,
+}
+
+impl<'m> ReadOnlyMetadataBuilder<'m> {
+    pub fn new(metadata: VariantMetadata<'m>) -> Self {
+        Self {
+            metadata,
+            known_field_names: HashMap::new(),
+        }
+    }
+}
+
+impl MetadataBuilder for ReadOnlyMetadataBuilder<'_> {
+    fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, 
ArrowError> {
+        if let Some(field_id) = self.known_field_names.get(field_name) {
+            return Ok(*field_id as u32);
+        }
+
+        // TODO: Be (a lot) smarter here!
+        let Some(field_id) = self.metadata.iter().position(|name| name == 
field_name) else {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Field name '{field_name}' not found in metadata",
+            )));
+        };
+        self.known_field_names.insert(self.metadata.get_infallible(field_id), 
field_id);

Review Comment:
   Are we not filling `known_field_names` in at initialization due to 
performance considerations?



##########
parquet-variant/src/builder.rs:
##########
@@ -589,82 +602,202 @@ impl<S: AsRef<str>> Extend<S> for MetadataBuilder {
 /// parent, and we cannot "split" a mutable reference across two objects 
(parent state and the child
 /// builder that uses it). So everything has to be here. Rust layout 
optimizations should treat the
 /// variants as a union, so that accessing a `buffer` or `metadata_builder` is 
branch-free.
-enum ParentState<'a> {
+#[derive(Debug)]
+pub enum ParentState<'a> {
     Variant {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
+        finished: bool,
     },
     List {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
-        parent_value_offset_base: usize,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
         offsets: &'a mut Vec<usize>,
+        saved_offsets_size: usize,
+        finished: bool,
     },
     Object {
-        buffer: &'a mut ValueBuffer,
-        metadata_builder: &'a mut MetadataBuilder,
+        buffer: &'a mut ValueBuilder,
+        saved_buffer_offset: usize,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        saved_metadata_builder_dict_size: usize,
         fields: &'a mut IndexMap<u32, usize>,
-        field_name: &'a str,
-        parent_value_offset_base: usize,
+        saved_fields_size: usize,
+        finished: bool,
     },
 }
 
-impl ParentState<'_> {
-    fn buffer(&mut self) -> &mut ValueBuffer {
-        match self {
-            ParentState::Variant { buffer, .. } => buffer,
-            ParentState::List { buffer, .. } => buffer,
-            ParentState::Object { buffer, .. } => buffer,
+impl<'a> ParentState<'a> {
+    pub fn variant(
+        buffer: &'a mut ValueBuilder,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+    ) -> Self {
+        ParentState::Variant {
+            saved_buffer_offset: buffer.offset(),
+            saved_metadata_builder_dict_size: 
metadata_builder.num_field_names(),
+            buffer,
+            metadata_builder,
+            finished: false,
+        }
+    }
+
+    pub fn list(
+        buffer: &'a mut ValueBuilder,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        offsets: &'a mut Vec<usize>,
+        saved_parent_buffer_offset: usize,
+    ) -> Self {
+        // The saved_parent_buffer_offset is the buffer size as of when the 
parent builder was
+        // constructed. The saved_buffer_offset is the buffer size as of now 
(when a child builder
+        // is created). The variant field_offset entry for this list element 
is their difference.
+        let saved_buffer_offset = buffer.offset();
+        let saved_offsets_size = offsets.len();
+        offsets.push(saved_buffer_offset - saved_parent_buffer_offset);
+
+        ParentState::List {
+            saved_metadata_builder_dict_size: 
metadata_builder.num_field_names(),
+            saved_buffer_offset,
+            saved_offsets_size,
+            metadata_builder,
+            buffer,
+            offsets,
+            finished: false,
+        }
+    }
+
+    pub fn object(
+        buffer: &'a mut ValueBuilder,
+        metadata_builder: &'a mut dyn MetadataBuilder,
+        fields: &'a mut IndexMap<u32, usize>,
+        saved_parent_buffer_offset: usize,
+        field_name: &str,
+        validate_unique_fields: bool,
+    ) -> Result<Self, ArrowError> {
+        // The saved_parent_buffer_offset is the buffer size as of when the 
parent builder was
+        // constructed. The saved_buffer_offset is the buffer size as of now 
(when a child builder
+        // is created). The variant field_offset entry for this field is their 
difference.
+        let saved_buffer_offset = buffer.offset();
+        let saved_fields_size = fields.len();
+        let saved_metadata_builder_dict_size = 
metadata_builder.num_field_names();
+        let field_id = metadata_builder.try_upsert_field_name(field_name)?;
+        let field_start = saved_buffer_offset - saved_parent_buffer_offset;
+        if fields.insert(field_id, field_start).is_some() && 
validate_unique_fields {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Duplicate field: {field_name}"
+            )));
         }
+
+        Ok(ParentState::Object {
+            saved_metadata_builder_dict_size,
+            saved_buffer_offset,
+            saved_fields_size,
+            buffer,
+            metadata_builder,
+            fields,
+            finished: false,
+        })
     }
 
-    fn metadata_builder(&mut self) -> &mut MetadataBuilder {
+    fn buffer(&mut self) -> &mut ValueBuilder {
+        self.buffer_and_metadata_builder().0
+    }
+
+    fn metadata_builder(&mut self) -> &mut dyn MetadataBuilder {
+        self.buffer_and_metadata_builder().1
+    }
+
+    fn saved_buffer_offset(&mut self) -> usize {
         match self {
             ParentState::Variant {
-                metadata_builder, ..
-            } => metadata_builder,
-            ParentState::List {
-                metadata_builder, ..
-            } => metadata_builder,
-            ParentState::Object {
-                metadata_builder, ..
-            } => metadata_builder,
+                saved_buffer_offset,
+                ..
+            }
+            | ParentState::List {
+                saved_buffer_offset,
+                ..
+            }
+            | ParentState::Object {
+                saved_buffer_offset,
+                ..
+            } => *saved_buffer_offset,
         }
     }
 
-    // Performs any parent-specific aspects of finishing, after the child has 
appended all necessary
-    // bytes to the parent's value buffer. ListBuilder records the new value's 
starting offset;
-    // ObjectBuilder associates the new value's starting offset with its field 
id; VariantBuilder
-    // doesn't need anything special.
-    fn finish(&mut self, starting_offset: usize) {
+    fn is_finished(&mut self) -> &mut bool {
+        match self {
+            ParentState::Variant { finished, .. }
+            | ParentState::List { finished, .. }
+            | ParentState::Object { finished, .. } => finished,
+        }
+    }
+
+    // Mark the insertion as having succeeded.
+    fn finish(&mut self) {
+        *self.is_finished() = true
+    }
+
+    // Performs any parent-specific aspects of rolling back a builder if an 
insertion failed.
+    fn rollback(&mut self) {

Review Comment:
   Will `rollback_if_needed` be better here?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to