alamb commented on code in PR #8185: URL: https://github.com/apache/arrow-rs/pull/8185#discussion_r2288977440
########## parquet-variant/src/builder.rs: ########## @@ -589,72 +582,190 @@ impl<S: AsRef<str>> Extend<S> for MetadataBuilder { /// parent, and we cannot "split" a mutable reference across two objects (parent state and the child /// builder that uses it). So everything has to be here. Rust layout optimizations should treat the /// variants as a union, so that accessing a `buffer` or `metadata_builder` is branch-free. +#[derive(Debug)] enum ParentState<'a> { Variant { buffer: &'a mut ValueBuffer, + saved_buffer_offset: usize, metadata_builder: &'a mut MetadataBuilder, + saved_metadata_builder_dict_size: usize, + finished: bool, }, List { buffer: &'a mut ValueBuffer, + saved_buffer_offset: usize, metadata_builder: &'a mut MetadataBuilder, - parent_value_offset_base: usize, + saved_metadata_builder_dict_size: usize, offsets: &'a mut Vec<usize>, + saved_offsets_size: usize, + finished: bool, }, Object { buffer: &'a mut ValueBuffer, + saved_buffer_offset: usize, metadata_builder: &'a mut MetadataBuilder, + saved_metadata_builder_dict_size: usize, fields: &'a mut IndexMap<u32, usize>, - field_name: &'a str, - parent_value_offset_base: usize, + saved_fields_size: usize, + finished: bool, }, } -impl ParentState<'_> { - fn buffer(&mut self) -> &mut ValueBuffer { - match self { - ParentState::Variant { buffer, .. } => buffer, - ParentState::List { buffer, .. } => buffer, - ParentState::Object { buffer, .. } => buffer, +impl<'a> ParentState<'a> { + fn variant(buffer: &'a mut ValueBuffer, metadata_builder: &'a mut MetadataBuilder) -> Self { + ParentState::Variant { + saved_buffer_offset: buffer.offset(), + saved_metadata_builder_dict_size: metadata_builder.num_field_names(), + buffer, + metadata_builder, + finished: false, + } + } + + fn list( + buffer: &'a mut ValueBuffer, + metadata_builder: &'a mut MetadataBuilder, + offsets: &'a mut Vec<usize>, + saved_parent_buffer_offset: usize, + ) -> Self { + // The saved_parent_buffer_offset is the buffer size as of when the parent builder was + // constructed. The saved_buffer_offset is the buffer size as of now (when a child builder + // is created). The variant field_offset entry for this list element is their difference. + let saved_buffer_offset = buffer.offset(); + let saved_offsets_size = offsets.len(); + offsets.push(saved_buffer_offset - saved_parent_buffer_offset); + + ParentState::List { + saved_metadata_builder_dict_size: metadata_builder.num_field_names(), + saved_buffer_offset, + saved_offsets_size, + metadata_builder, + buffer, + offsets, + finished: false, } } + fn object( Review Comment: nit: could call this `try_object` to reflect it can return `Result` ########## parquet-variant/src/builder.rs: ########## @@ -589,72 +582,190 @@ impl<S: AsRef<str>> Extend<S> for MetadataBuilder { /// parent, and we cannot "split" a mutable reference across two objects (parent state and the child /// builder that uses it). So everything has to be here. Rust layout optimizations should treat the /// variants as a union, so that accessing a `buffer` or `metadata_builder` is branch-free. +#[derive(Debug)] enum ParentState<'a> { Variant { buffer: &'a mut ValueBuffer, + saved_buffer_offset: usize, metadata_builder: &'a mut MetadataBuilder, + saved_metadata_builder_dict_size: usize, + finished: bool, }, List { buffer: &'a mut ValueBuffer, + saved_buffer_offset: usize, metadata_builder: &'a mut MetadataBuilder, - parent_value_offset_base: usize, + saved_metadata_builder_dict_size: usize, offsets: &'a mut Vec<usize>, + saved_offsets_size: usize, + finished: bool, }, Object { buffer: &'a mut ValueBuffer, + saved_buffer_offset: usize, metadata_builder: &'a mut MetadataBuilder, + saved_metadata_builder_dict_size: usize, fields: &'a mut IndexMap<u32, usize>, - field_name: &'a str, - parent_value_offset_base: usize, + saved_fields_size: usize, + finished: bool, }, } -impl ParentState<'_> { - fn buffer(&mut self) -> &mut ValueBuffer { - match self { - ParentState::Variant { buffer, .. } => buffer, - ParentState::List { buffer, .. } => buffer, - ParentState::Object { buffer, .. } => buffer, +impl<'a> ParentState<'a> { + fn variant(buffer: &'a mut ValueBuffer, metadata_builder: &'a mut MetadataBuilder) -> Self { + ParentState::Variant { + saved_buffer_offset: buffer.offset(), + saved_metadata_builder_dict_size: metadata_builder.num_field_names(), + buffer, + metadata_builder, + finished: false, + } + } + + fn list( + buffer: &'a mut ValueBuffer, + metadata_builder: &'a mut MetadataBuilder, + offsets: &'a mut Vec<usize>, + saved_parent_buffer_offset: usize, + ) -> Self { + // The saved_parent_buffer_offset is the buffer size as of when the parent builder was + // constructed. The saved_buffer_offset is the buffer size as of now (when a child builder + // is created). The variant field_offset entry for this list element is their difference. + let saved_buffer_offset = buffer.offset(); + let saved_offsets_size = offsets.len(); + offsets.push(saved_buffer_offset - saved_parent_buffer_offset); + + ParentState::List { + saved_metadata_builder_dict_size: metadata_builder.num_field_names(), + saved_buffer_offset, + saved_offsets_size, + metadata_builder, + buffer, + offsets, + finished: false, } } + fn object( + buffer: &'a mut ValueBuffer, + metadata_builder: &'a mut MetadataBuilder, + fields: &'a mut IndexMap<u32, usize>, + saved_parent_buffer_offset: usize, + field_name: &str, + validate_unique_fields: bool, + ) -> Result<Self, ArrowError> { + // The saved_parent_buffer_offset is the buffer size as of when the parent builder was + // constructed. The saved_buffer_offset is the buffer size as of now (when a child builder + // is created). The variant field_offset entry for this field is their difference. + let saved_buffer_offset = buffer.offset(); + let saved_fields_size = fields.len(); + let saved_metadata_builder_dict_size = metadata_builder.num_field_names(); + let field_id = metadata_builder.upsert_field_name(field_name); + let field_start = saved_buffer_offset - saved_parent_buffer_offset; + if fields.insert(field_id, field_start).is_some() && validate_unique_fields { + return Err(ArrowError::InvalidArgumentError(format!( + "Duplicate field name: {field_name}" + ))); + } + + Ok(ParentState::Object { + saved_metadata_builder_dict_size, + saved_buffer_offset, + saved_fields_size, + buffer, + metadata_builder, + fields, + finished: false, + }) + } + + fn buffer(&mut self) -> &mut ValueBuffer { + self.buffer_and_metadata_builder().0 + } + fn metadata_builder(&mut self) -> &mut MetadataBuilder { + self.buffer_and_metadata_builder().1 + } + + fn saved_buffer_offset(&mut self) -> usize { match self { ParentState::Variant { - metadata_builder, .. - } => metadata_builder, - ParentState::List { - metadata_builder, .. - } => metadata_builder, - ParentState::Object { - metadata_builder, .. - } => metadata_builder, + saved_buffer_offset, + .. + } + | ParentState::List { + saved_buffer_offset, + .. + } + | ParentState::Object { + saved_buffer_offset, + .. + } => *saved_buffer_offset, } } - // Performs any parent-specific aspects of finishing, after the child has appended all necessary - // bytes to the parent's value buffer. ListBuilder records the new value's starting offset; - // ObjectBuilder associates the new value's starting offset with its field id; VariantBuilder - // doesn't need anything special. - fn finish(&mut self, starting_offset: usize) { + fn is_finished(&mut self) -> &mut bool { + match self { + ParentState::Variant { finished, .. } + | ParentState::List { finished, .. } + | ParentState::Object { finished, .. } => finished, + } + } + + // Mark the insertion as having succeeded. + fn finish(&mut self) { + *self.is_finished() = true + } + + // Performs any parent-specific aspects of rolling back a builder if an insertion failed. + fn rollback(&mut self) { + if *self.is_finished() { + return; + } + + // All builders need to revert the buffers + match self { + ParentState::Variant { + buffer, + saved_buffer_offset, + metadata_builder, + saved_metadata_builder_dict_size, + .. + } + | ParentState::List { + buffer, + saved_buffer_offset, + metadata_builder, + saved_metadata_builder_dict_size, + .. + } + | ParentState::Object { + buffer, + saved_buffer_offset, + metadata_builder, + saved_metadata_builder_dict_size, + .. + } => { + buffer.inner_mut().truncate(*saved_buffer_offset); Review Comment: this is very nice -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org