alamb commented on code in PR #8167: URL: https://github.com/apache/arrow-rs/pull/8167#discussion_r2285975134
########## parquet-variant-compute/src/variant_array_builder.rs: ########## @@ -136,20 +140,15 @@ impl VariantArrayBuilder { pub fn append_null(&mut self) { self.nulls.append_null(); // The subfields are expected to be non-nullable according to the parquet variant spec. - let metadata_offset = self.metadata_buffer.len(); - let metadata_length = 0; - self.metadata_locations - .push((metadata_offset, metadata_length)); - let value_offset = self.value_buffer.len(); - let value_length = 0; - self.value_locations.push((value_offset, value_length)); + self.metadata_offsets.push(self.metadata_builder.offset()); + self.value_offsets.push(self.value_builder.offset()); Review Comment: Not as part of this PR, but I think the Null elements are supposed to have a `Variant::Null` rather than empty for all elements that are marked as null. ########## parquet-variant/src/builder.rs: ########## @@ -1428,54 +1415,61 @@ impl<'a> ObjectBuilder<'a> { } // Returns validate_unique_fields because we can no longer reference self once this method returns. - fn parent_state<'b>(&'b mut self, key: &'b str) -> (ParentState<'b>, bool) { + fn parent_state<'b>( + &'b mut self, + field_name: &'b str, + ) -> Result<(ParentState<'b>, bool), ArrowError> { + let saved_parent_buffer_offset = self.parent_state.saved_buffer_offset(); let validate_unique_fields = self.validate_unique_fields; - let (buffer, metadata_builder) = self.parent_state.buffer_and_metadata_builder(); - - let state = ParentState::Object { + let state = ParentState::object( buffer, metadata_builder, - fields: &mut self.fields, - field_name: key, - parent_value_offset_base: self.parent_value_offset_base, - }; - (state, validate_unique_fields) + &mut self.fields, + saved_parent_buffer_offset, + field_name, + validate_unique_fields, + )?; + Ok((state, validate_unique_fields)) } /// Returns an object builder that can be used to append a new (nested) object to this object. /// + /// Panics if the requested key cannot be inserted (e.g. because it is a duplicate). Review Comment: Another reason it could panic is that the builder has read only metadata, right? ########## parquet-variant/src/builder.rs: ########## @@ -86,28 +87,16 @@ fn append_packed_u32(dest: &mut Vec<u8>, value: u32, value_size: usize) { /// /// You can reuse an existing `Vec<u8>` by using the `from` impl #[derive(Debug, Default)] -struct ValueBuffer(Vec<u8>); +pub struct ValueBuilder(Vec<u8>); -impl ValueBuffer { +impl ValueBuilder { /// Construct a ValueBuffer that will write to a new underlying `Vec` - fn new() -> Self { + pub fn new() -> Self { Default::default() } } -impl From<Vec<u8>> for ValueBuffer { - fn from(value: Vec<u8>) -> Self { - Self(value) - } -} - -impl From<ValueBuffer> for Vec<u8> { - fn from(value_buffer: ValueBuffer) -> Self { - value_buffer.0 - } -} Review Comment: if it isn't needed I support removing it (we can always add it back later) ########## parquet-variant/src/variant/metadata.rs: ########## @@ -331,6 +331,11 @@ impl<'m> VariantMetadata<'m> { self.iter_try() .map(|result| result.expect("Invalid metadata dictionary entry")) } + + /// Same as `Index::index`, but with the correct lifetime. + pub(crate) fn get_infallible(&self, i: usize) -> &'m str { Review Comment: I don't understand this comment. Minor nit: directly using `get(i).unwrap()` at relevant locations rather than add a new function might be easier to understand than understanding when to use `get_infallible` 🤔 ########## parquet-variant/src/builder.rs: ########## @@ -234,8 +223,8 @@ impl ValueBuffer { self.append_slice(value.as_bytes()); } - fn append_object(&mut self, metadata_builder: &mut MetadataBuilder, obj: VariantObject) { - let mut object_builder = self.new_object(metadata_builder); + fn append_object(state: ParentState<'_>, obj: VariantObject) { + let mut object_builder = ObjectBuilder::new(state, false); Review Comment: I like this idea -- it makes a lot of sense to me ########## parquet-variant/src/builder.rs: ########## @@ -1428,54 +1415,61 @@ impl<'a> ObjectBuilder<'a> { } // Returns validate_unique_fields because we can no longer reference self once this method returns. - fn parent_state<'b>(&'b mut self, key: &'b str) -> (ParentState<'b>, bool) { + fn parent_state<'b>( + &'b mut self, + field_name: &'b str, + ) -> Result<(ParentState<'b>, bool), ArrowError> { + let saved_parent_buffer_offset = self.parent_state.saved_buffer_offset(); let validate_unique_fields = self.validate_unique_fields; - let (buffer, metadata_builder) = self.parent_state.buffer_and_metadata_builder(); - - let state = ParentState::Object { + let state = ParentState::object( buffer, metadata_builder, - fields: &mut self.fields, - field_name: key, - parent_value_offset_base: self.parent_value_offset_base, - }; - (state, validate_unique_fields) + &mut self.fields, + saved_parent_buffer_offset, + field_name, + validate_unique_fields, + )?; + Ok((state, validate_unique_fields)) } /// Returns an object builder that can be used to append a new (nested) object to this object. /// + /// Panics if the requested key cannot be inserted (e.g. because it is a duplicate). Review Comment: Another way we could avoid panic's would be to defer errors until the final `finish()` is called 🤔 I think this is fine for now ########## parquet-variant-json/src/from_json.rs: ########## @@ -630,10 +630,10 @@ mod test { // Verify metadata size = 1 + 2 + 2 * 497 + 3 * 496 assert_eq!(metadata.len(), 2485); // Verify value size. - // Size of innermost_list: 1 + 1 + 258 + 256 = 516 - // Size of inner object: 1 + 4 + 256 + 257 * 3 + 256 * 516 = 133128 - // Size of json: 1 + 4 + 512 + 1028 + 256 * 133128 = 34082313 - assert_eq!(value.len(), 34082313); + // Size of innermost_list: 1 + 1 + 2*(128 + 1) + 2*128 = 516 + // Size of inner object: 1 + 4 + 2*256 + 3*(256 + 1) + 256 * 516 = 133384 + // Size of json: 1 + 4 + 2*256 + 4*(256 + 1) + 256 * 133384 = 34147849 + assert_eq!(value.len(), 34147849); Review Comment: I agree with this analysis -- the test seems overly sensitive to small pturbatons ########## parquet-variant/src/builder.rs: ########## @@ -1575,22 +1536,30 @@ impl Drop for ObjectBuilder<'_> { pub trait VariantBuilderExt { fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>); - fn new_list(&mut self) -> ListBuilder<'_>; + fn new_list(&mut self) -> ListBuilder<'_> { + self.try_new_list().unwrap() + } + + fn new_object(&mut self) -> ObjectBuilder<'_> { + self.try_new_object().unwrap() + } - fn new_object(&mut self) -> ObjectBuilder<'_>; + fn try_new_list(&mut self) -> Result<ListBuilder<'_>, ArrowError>; + + fn try_new_object(&mut self) -> Result<ObjectBuilder<'_>, ArrowError>; Review Comment: Yeah, me too -- it is pretty annoying to have to check for errors on every single call to creating these objects, even when it can not fail (e.g. creating in memory variants) Another thought I had was we could potentially defer errors that could arise into the call to `build()` -- but then that might results in hard to track down errors 🤔 ########## parquet-variant/src/builder.rs: ########## @@ -437,13 +397,76 @@ impl ValueBuffer { } } +pub trait MetadataBuilder: std::fmt::Debug { + fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, ArrowError>; + fn field_name(&self, field_id: usize) -> &str; + fn num_field_names(&self) -> usize; + fn truncate_field_names(&mut self, new_size: usize); +} Review Comment: The API makes sense to me -- I think it would be nice to add some comments, but in general 👍 ########## parquet-variant/src/builder.rs: ########## @@ -1299,61 +1330,26 @@ impl<'a> ListBuilder<'a> { .inner_mut() .splice(starting_offset..starting_offset, bytes_to_splice); - self.parent_state.finish(starting_offset); - self.has_been_finished = true; - } -} - -/// Drop implementation for ListBuilder does nothing -/// as the `finish` method must be called to finalize the list. -/// This is to ensure that the list is always finalized before its parent builder -/// is finalized. -impl Drop for ListBuilder<'_> { - fn drop(&mut self) { - if !self.has_been_finished { - self.parent_state - .buffer() - .inner_mut() - .truncate(self.parent_value_offset_base); - self.parent_state - .metadata_builder() - .field_names - .truncate(self.parent_metadata_offset_base); - } + self.parent_state.finish(); } } /// A builder for creating [`Variant::Object`] values. /// /// See the examples on [`VariantBuilder`] for usage. +#[derive(Debug)] pub struct ObjectBuilder<'a> { parent_state: ParentState<'a>, fields: IndexMap<u32, usize>, // (field_id, offset) - /// The starting offset in the parent's buffer where this object starts - parent_value_offset_base: usize, - /// The starting offset in the parent's metadata buffer where this object starts - /// used to truncate the written fields in `drop` if the current object has not been finished - parent_metadata_offset_base: usize, - /// Whether the object has been finished, the written content of the current object - /// will be truncated in `drop` if `has_been_finished` is false - has_been_finished: bool, Review Comment: it is very nice that the parent state handles this nicely now ########## parquet-variant/src/builder.rs: ########## @@ -437,13 +397,76 @@ impl ValueBuffer { } } +pub trait MetadataBuilder: std::fmt::Debug { + fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, ArrowError>; + fn field_name(&self, field_id: usize) -> &str; + fn num_field_names(&self) -> usize; + fn truncate_field_names(&mut self, new_size: usize); +} + +impl MetadataBuilder for MetadataBuilderXX { + fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, ArrowError> { + Ok(self.upsert_field_name(field_name)) + } + fn field_name(&self, field_id: usize) -> &str { + self.field_name(field_id) + } + fn num_field_names(&self) -> usize { + self.num_field_names() + } + fn truncate_field_names(&mut self, new_size: usize) { + self.field_names.truncate(new_size) + } +} + +#[derive(Debug)] +pub struct ReadOnlyMetadataBuilder<'m> { + metadata: VariantMetadata<'m>, + known_field_names: HashMap<&'m str, usize>, +} + +impl<'m> ReadOnlyMetadataBuilder<'m> { + pub fn new(metadata: VariantMetadata<'m>) -> Self { + Self { + metadata, + known_field_names: HashMap::new(), + } + } +} + +impl MetadataBuilder for ReadOnlyMetadataBuilder<'_> { + fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, ArrowError> { + if let Some(field_id) = self.known_field_names.get(field_name) { + return Ok(*field_id as u32); + } + + // TODO: Be (a lot) smarter here! + let Some(field_id) = self.metadata.iter().position(|name| name == field_name) else { + return Err(ArrowError::InvalidArgumentError(format!( + "Field name '{field_name}' not found in metadata", + ))); + }; + self.known_field_names.insert(self.metadata.get_infallible(field_id), field_id); + Ok(field_id as u32) + } + fn field_name(&self, field_id: usize) -> &str { + &self.metadata[field_id] + } + fn num_field_names(&self) -> usize { + self.metadata.len() + } + fn truncate_field_names(&mut self, new_size: usize) { + debug_assert_eq!(self.metadata.len(), new_size); + } +} + /// Builder for constructing metadata for [`Variant`] values. /// /// This is used internally by the [`VariantBuilder`] to construct the metadata /// /// You can use an existing `Vec<u8>` as the metadata buffer by using the `from` impl. #[derive(Default, Debug)] -struct MetadataBuilder { +pub struct MetadataBuilderXX { Review Comment: How about `OwnedMetadataBuilder` or `DefaultMetadataBuilder` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org