This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new f7f8f95368 [Variant] Add `VariantBuilder` values check (#10016)
f7f8f95368 is described below
commit f7f8f953686add2c4979ac88eb3db884d92504ea
Author: Konstantin Tarasov <[email protected]>
AuthorDate: Mon Jun 22 14:15:37 2026 -0400
[Variant] Add `VariantBuilder` values check (#10016)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Closes #7870.
# Rationale for this change
- (Single) `VariantBuilder` accepts 0/1+ values on finish.
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
# What changes are included in this PR?
- Add only 1 value validation for `VariantBuilder`
- Move `try_new_list/try_new_object` to `VariantBuilder`
- Replaced `VariantBuilder` with `WritableMetadataBuilder` in tests that
only build metadata.
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
# Are these changes tested?
- Yes, existing unit tests pass.
- Ran benchmarks locally and added some `[#inline]`
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
If this PR claims a performance improvement, please include evidence
such as benchmark results.
-->
# Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
---
parquet-variant/src/builder.rs | 236 ++++++++++++++++++++++++++++---
parquet-variant/src/builder/metadata.rs | 44 +-----
parquet-variant/src/builder/object.rs | 15 +-
parquet-variant/tests/variant_interop.rs | 11 +-
4 files changed, 239 insertions(+), 67 deletions(-)
diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs
index e6122f062c..b304dd8102 100644
--- a/parquet-variant/src/builder.rs
+++ b/parquet-variant/src/builder.rs
@@ -52,6 +52,31 @@ pub(crate) fn int_size(v: usize) -> OffsetSizeBytes {
}
}
+const ONE_TOP_LEVEL_VALUE_MSG: &str =
+ "VariantBuilder already contains a top-level variant value; only one is
allowed";
+const EMPTY_BUILDER_MSG: &str =
+ "VariantBuilder is empty; append a top-level value before calling
finish()";
+
+// Error/panic construction is kept out-of-line and cold so the per-call
guards on the
+// builder hot paths stay small enough to inline as a single predictable
branch.
+#[cold]
+#[inline(never)]
+fn top_level_value_panic() -> ! {
+ panic!("{ONE_TOP_LEVEL_VALUE_MSG}");
+}
+
+#[cold]
+#[inline(never)]
+fn top_level_value_error() -> ArrowError {
+ ArrowError::InvalidArgumentError(ONE_TOP_LEVEL_VALUE_MSG.into())
+}
+
+#[cold]
+#[inline(never)]
+fn empty_builder_error() -> ArrowError {
+ ArrowError::InvalidArgumentError(EMPTY_BUILDER_MSG.into())
+}
+
/// Wrapper around a `Vec<u8>` that provides methods for appending
/// primitive values, variant types, and metadata.
///
@@ -792,30 +817,85 @@ impl VariantBuilder {
self.metadata_builder.upsert_field_name(field_name);
}
+ /// Returns true if a top-level variant value has already been written to
this builder.
+ ///
+ /// A [`VariantBuilder`] holds exactly one top-level variant value. Any
committed top-level
+ /// value leaves bytes in the value buffer; a child builder dropped
without `finish()` has
+ /// its bytes rolled back by [`ParentState`], so the offset is a faithful
indicator.
+ #[inline]
+ fn has_top_level_value(&self) -> bool {
+ self.value_builder.offset() != 0
+ }
+
+ #[inline]
+ fn ensure_no_top_level_value(&self) {
+ if self.has_top_level_value() {
+ top_level_value_panic();
+ }
+ }
+
+ #[inline]
+ fn check_no_top_level_value(&self) -> Result<(), ArrowError> {
+ if self.has_top_level_value() {
+ return Err(top_level_value_error());
+ }
+ Ok(())
+ }
+
/// Create an [`ListBuilder`] for creating [`Variant::List`] values.
///
/// See the examples on [`VariantBuilder`] for usage.
+ ///
+ /// # Panics
+ ///
+ /// Panics if a top-level variant value has already been written to this
builder. For a
+ /// fallible version, use [`VariantBuilder::try_new_list`].
pub fn new_list(&mut self) -> ListBuilder<'_, ()> {
+ self.try_new_list().unwrap()
+ }
+
+ /// Create an [`ListBuilder`] for creating [`Variant::List`] values.
+ ///
+ /// Returns an error if a top-level variant value has already been written
to this builder.
+ pub fn try_new_list(&mut self) -> Result<ListBuilder<'_, ()>, ArrowError> {
+ self.check_no_top_level_value()?;
let parent_state =
ParentState::variant(&mut self.value_builder, &mut
self.metadata_builder);
- ListBuilder::new(parent_state, self.validate_unique_fields)
+ Ok(ListBuilder::new(parent_state, self.validate_unique_fields))
}
/// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values.
///
/// See the examples on [`VariantBuilder`] for usage.
+ ///
+ /// # Panics
+ ///
+ /// Panics if a top-level variant value has already been written to this
builder. For a
+ /// fallible version, use [`VariantBuilder::try_new_object`].
pub fn new_object(&mut self) -> ObjectBuilder<'_, ()> {
+ self.try_new_object().unwrap()
+ }
+
+ /// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values.
+ ///
+ /// Returns an error if a top-level variant value has already been written
to this builder.
+ pub fn try_new_object(&mut self) -> Result<ObjectBuilder<'_, ()>,
ArrowError> {
+ self.check_no_top_level_value()?;
let parent_state =
ParentState::variant(&mut self.value_builder, &mut
self.metadata_builder);
- ObjectBuilder::new(parent_state, self.validate_unique_fields)
+ Ok(ObjectBuilder::new(
+ parent_state,
+ self.validate_unique_fields,
+ ))
}
/// Append a value to the builder.
///
/// # Panics
///
- /// This method will panic if the variant contains duplicate field names
in objects
- /// when validation is enabled. For a fallible version, use
[`VariantBuilder::try_append_value`]
+ /// Panics if a top-level variant value has already been written to this
builder, or if the
+ /// variant contains duplicate field names in objects when validation is
enabled. For a
+ /// fallible version, use [`VariantBuilder::try_append_value`].
///
/// # Example
/// ```
@@ -825,15 +905,19 @@ impl VariantBuilder {
/// builder.append_value(42i8);
/// ```
pub fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T)
{
+ self.ensure_no_top_level_value();
let state = ParentState::variant(&mut self.value_builder, &mut
self.metadata_builder);
ValueBuilder::append_variant(state, value.into())
}
/// Append a value to the builder.
+ ///
+ /// Returns an error if a top-level variant value has already been written
to this builder.
pub fn try_append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(
&mut self,
value: T,
) -> Result<(), ArrowError> {
+ self.check_no_top_level_value()?;
let state = ParentState::variant(&mut self.value_builder, &mut
self.metadata_builder);
ValueBuilder::try_append_variant(state, value.into())
}
@@ -846,18 +930,51 @@ impl VariantBuilder {
///
/// The caller must ensure that the metadata dictionary entries are
already built and correct for
/// any objects or lists being appended.
+ ///
+ /// # Panics
+ ///
+ /// Panics if a top-level variant value has already been written to this
builder. For a
+ /// fallible version, use [`VariantBuilder::try_append_value_bytes`].
pub fn append_value_bytes<'m, 'd>(&mut self, value: impl Into<Variant<'m,
'd>>) {
+ self.try_append_value_bytes(value).unwrap()
+ }
+
+ /// Tries to append a variant value to the builder by copying raw bytes
when possible.
+ ///
+ /// This is the fallible version of
[`VariantBuilder::append_value_bytes`]. Returns an error
+ /// if a top-level variant value has already been written to this builder.
+ pub fn try_append_value_bytes<'m, 'd>(
+ &mut self,
+ value: impl Into<Variant<'m, 'd>>,
+ ) -> Result<(), ArrowError> {
+ self.check_no_top_level_value()?;
let state = ParentState::variant(&mut self.value_builder, &mut
self.metadata_builder);
ValueBuilder::append_variant_bytes(state, value.into());
+ Ok(())
+ }
+
+ /// Finish the builder and return the metadata and value buffers.
+ ///
+ /// # Panics
+ ///
+ /// Panics if no top-level variant value has been appended. For a fallible
version, use
+ /// [`VariantBuilder::try_finish`].
+ pub fn finish(self) -> (Vec<u8>, Vec<u8>) {
+ self.try_finish().expect(EMPTY_BUILDER_MSG)
}
/// Finish the builder and return the metadata and value buffers.
- pub fn finish(mut self) -> (Vec<u8>, Vec<u8>) {
+ ///
+ /// Returns an error if no top-level variant value has been appended.
+ pub fn try_finish(mut self) -> Result<(Vec<u8>, Vec<u8>), ArrowError> {
+ if !self.has_top_level_value() {
+ return Err(empty_builder_error());
+ }
self.metadata_builder.finish();
- (
+ Ok((
self.metadata_builder.into_inner(),
self.value_builder.into_inner(),
- )
+ ))
}
}
@@ -915,11 +1032,11 @@ impl VariantBuilderExt for VariantBuilder {
}
fn try_new_list(&mut self) -> Result<ListBuilder<'_, Self::State<'_>>,
ArrowError> {
- Ok(self.new_list())
+ self.try_new_list()
}
fn try_new_object(&mut self) -> Result<ObjectBuilder<'_, Self::State<'_>>,
ArrowError> {
- Ok(self.new_object())
+ self.try_new_object()
}
}
@@ -964,6 +1081,96 @@ mod tests {
assert_eq!(variant, expected);
}
+ #[test]
+ fn test_try_finish_empty_builder_errors() {
+ let builder = VariantBuilder::new();
+ let err = builder.try_finish().unwrap_err();
+ assert!(err.to_string().contains("empty"), "unexpected error: {err}");
+ }
+
+ #[test]
+ #[should_panic(expected = "empty")]
+ fn test_finish_empty_builder_panics() {
+ let builder = VariantBuilder::new();
+ let _ = builder.finish();
+ }
+
+ #[test]
+ fn test_try_append_value_after_value_errors() {
+ let mut builder = VariantBuilder::new();
+ builder.append_value(1i32);
+ let err = builder.try_append_value(2i32).unwrap_err();
+ assert!(
+ err.to_string().contains("only one is allowed"),
+ "unexpected error: {err}"
+ );
+ }
+
+ #[test]
+ fn test_try_append_value_bytes_after_value_errors() {
+ let mut builder = VariantBuilder::new();
+ builder.append_value(1i32);
+ let err = builder.try_append_value_bytes(2i32).unwrap_err();
+ assert!(
+ err.to_string().contains("only one is allowed"),
+ "unexpected error: {err}"
+ );
+ }
+
+ #[test]
+ fn test_try_new_list_after_value_errors() {
+ let mut builder = VariantBuilder::new();
+ builder.append_value(1i32);
+ let err = builder.try_new_list().expect_err("expected error");
+ assert!(
+ err.to_string().contains("only one is allowed"),
+ "unexpected error: {err}"
+ );
+ }
+
+ #[test]
+ fn test_try_new_object_after_value_errors() {
+ let mut builder = VariantBuilder::new();
+ builder.append_value(1i32);
+ let err = builder.try_new_object().expect_err("expected error");
+ assert!(
+ err.to_string().contains("only one is allowed"),
+ "unexpected error: {err}"
+ );
+ }
+
+ #[test]
+ #[should_panic(expected = "only one is allowed")]
+ fn test_append_value_after_value_panics() {
+ let mut builder = VariantBuilder::new();
+ builder.append_value(1i32);
+ builder.append_value(2i32);
+ }
+
+ #[test]
+ #[should_panic(expected = "only one is allowed")]
+ fn test_append_value_bytes_after_value_panics() {
+ let mut builder = VariantBuilder::new();
+ builder.append_value(1i32);
+ builder.append_value_bytes(2i32);
+ }
+
+ #[test]
+ #[should_panic(expected = "only one is allowed")]
+ fn test_new_list_after_value_panics() {
+ let mut builder = VariantBuilder::new();
+ builder.append_value(1i32);
+ let _ = builder.new_list();
+ }
+
+ #[test]
+ #[should_panic(expected = "only one is allowed")]
+ fn test_new_object_after_value_panics() {
+ let mut builder = VariantBuilder::new();
+ builder.append_value(1i32);
+ let _ = builder.new_object();
+ }
+
#[test]
fn test_nested_object_with_lists() {
/*
@@ -1044,14 +1251,9 @@ mod tests {
variant2.add_field_name("a");
assert!(!variant2.metadata_builder.is_sorted);
- // per the spec, make sure the variant will fail to build if only
metadata is provided
- let (m, v) = variant2.finish();
- let res = Variant::try_new(&m, &v);
- assert!(res.is_err());
-
- // since it is not sorted, make sure the metadata says so
- let header = VariantMetadata::try_new(&m).unwrap();
- assert!(!header.is_sorted());
+ // per the spec, a variant must have a top-level value;
try_finish() rejects empty
+ let err = variant2.try_finish().unwrap_err();
+ assert!(err.to_string().contains("empty"), "unexpected error:
{err}");
}
// write out variant1 and make sure the sorted flag is properly encoded
diff --git a/parquet-variant/src/builder/metadata.rs
b/parquet-variant/src/builder/metadata.rs
index efccc2e4c6..10b34ced03 100644
--- a/parquet-variant/src/builder/metadata.rs
+++ b/parquet-variant/src/builder/metadata.rs
@@ -268,7 +268,7 @@ impl<S: AsRef<str>> Extend<S> for WritableMetadataBuilder {
#[cfg(test)]
mod test {
use crate::{
- ParentState, ValueBuilder, Variant, VariantBuilder, VariantMetadata,
+ ParentState, ValueBuilder, VariantMetadata,
builder::{
metadata::{ReadOnlyMetadataBuilder, WritableMetadataBuilder},
object::ObjectBuilder,
@@ -356,47 +356,13 @@ mod test {
assert_eq!(metadata.num_field_names(), 3);
}
- #[test]
- fn test_read_only_metadata_builder() {
- // First create some metadata with a few field names
- let mut default_builder = VariantBuilder::new();
- default_builder.add_field_name("name");
- default_builder.add_field_name("age");
- default_builder.add_field_name("active");
- let (metadata_bytes, _) = default_builder.finish();
-
- // Use the metadata to build new variant values
- let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
- let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata);
- let mut value_builder = ValueBuilder::new();
-
- {
- let state = ParentState::variant(&mut value_builder, &mut
metadata_builder);
- let mut obj = ObjectBuilder::new(state, false);
-
- // These should succeed because the fields exist in the metadata
- obj.insert("name", "Alice");
- obj.insert("age", 30i8);
- obj.insert("active", true);
- obj.finish();
- }
-
- let value = value_builder.into_inner();
-
- // Verify the variant was built correctly
- let variant = Variant::try_new(&metadata_bytes, &value).unwrap();
- let obj = variant.as_object().unwrap();
- assert_eq!(obj.get("name"), Some(Variant::from("Alice")));
- assert_eq!(obj.get("age"), Some(Variant::Int8(30)));
- assert_eq!(obj.get("active"), Some(Variant::from(true)));
- }
-
#[test]
fn test_read_only_metadata_builder_fails_on_unknown_field() {
// Create metadata with only one field
- let mut default_builder = VariantBuilder::new();
- default_builder.add_field_name("known_field");
- let (metadata_bytes, _) = default_builder.finish();
+ let mut default_builder = WritableMetadataBuilder::default();
+ default_builder.upsert_field_name("known_field");
+ default_builder.finish();
+ let metadata_bytes = default_builder.into_inner();
// Use the metadata to build new variant values
let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
diff --git a/parquet-variant/src/builder/object.rs
b/parquet-variant/src/builder/object.rs
index 876c2e2d4c..670e3f7d76 100644
--- a/parquet-variant/src/builder/object.rs
+++ b/parquet-variant/src/builder/object.rs
@@ -431,6 +431,7 @@ impl<S: BuilderSpecificState> VariantBuilderExt for
ObjectFieldBuilder<'_, '_, '
mod tests {
use crate::{
ParentState, ValueBuilder, Variant, VariantBuilder, VariantMetadata,
+ WritableMetadataBuilder,
builder::{metadata::ReadOnlyMetadataBuilder, object::ObjectBuilder},
decoder::VariantBasicType,
};
@@ -501,11 +502,12 @@ mod tests {
#[test]
fn test_read_only_metadata_builder() {
// First create some metadata with a few field names
- let mut default_builder = VariantBuilder::new();
- default_builder.add_field_name("name");
- default_builder.add_field_name("age");
- default_builder.add_field_name("active");
- let (metadata_bytes, _) = default_builder.finish();
+ let mut default_builder = WritableMetadataBuilder::default();
+ default_builder.upsert_field_name("name");
+ default_builder.upsert_field_name("age");
+ default_builder.upsert_field_name("active");
+ default_builder.finish();
+ let metadata_bytes = default_builder.into_inner();
// Use the metadata to build new variant values
let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
@@ -899,7 +901,8 @@ mod tests {
inner_list.finish();
outer_list.finish();
- // Valid object should succeed
+ // Valid object should succeed (fresh builder — one top-level value
per VariantBuilder)
+ let mut builder =
VariantBuilder::new().with_validate_unique_fields(true);
let mut list = builder.new_list();
let mut valid_obj = list.new_object();
valid_obj.insert("m", 1);
diff --git a/parquet-variant/tests/variant_interop.rs
b/parquet-variant/tests/variant_interop.rs
index 70b42e1f3c..4c7c10d51a 100644
--- a/parquet-variant/tests/variant_interop.rs
+++ b/parquet-variant/tests/variant_interop.rs
@@ -466,11 +466,12 @@ fn generate_random_value(rng: &mut StdRng, builder: &mut
VariantBuilder, max_dep
)
.unwrap();
- // timestamp w/o timezone
- builder.append_value(data_time.naive_local());
-
- // timestamp with timezone
- builder.append_value(data_time.naive_utc().and_utc());
+ // randomly pick timestamp with or without timezone
+ if rng.random_bool(0.5) {
+ builder.append_value(data_time.naive_local());
+ } else {
+ builder.append_value(data_time.naive_utc().and_utc());
+ }
}
17 => {
builder.append_value(Uuid::new_v4());