This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 2405798912 [Variant] Add flag in `ObjectBuilder` to control validation
behavior on duplicate field write (#7801)
2405798912 is described below
commit 24057989124de2ef0867035156c6dc8dabed6a49
Author: Michael Renda <[email protected]>
AuthorDate: Tue Jul 1 11:18:33 2025 -0400
[Variant] Add flag in `ObjectBuilder` to control validation behavior on
duplicate field write (#7801)
# Which issue does this PR close?
Closes #7777.
# What changes are included in this PR?
Added method `with_validate_unique_fields` that modifies an
`ObjectBuilder` to throw an error in `finish` if duplicate fields were
inserted. The error message thrown displays all of the duplicate keys
that were inserted to the object.
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet-variant/benches/variant_builder.rs | 22 ++--
parquet-variant/src/builder.rs | 186 +++++++++++++++++++++++++----
parquet-variant/src/to_json.rs | 10 +-
parquet-variant/tests/variant_interop.rs | 2 +-
4 files changed, 180 insertions(+), 40 deletions(-)
diff --git a/parquet-variant/benches/variant_builder.rs
b/parquet-variant/benches/variant_builder.rs
index 432c4192e3..f69e3170c6 100644
--- a/parquet-variant/benches/variant_builder.rs
+++ b/parquet-variant/benches/variant_builder.rs
@@ -77,7 +77,7 @@ fn bench_object_field_names_reverse_order(c: &mut Criterion) {
object_builder.insert(format!("{}", 1000 - i).as_str(),
string_table.next());
}
- object_builder.finish();
+ object_builder.finish().unwrap();
hint::black_box(variant.finish());
})
});
@@ -113,7 +113,7 @@ fn bench_object_same_schema(c: &mut Criterion) {
inner_list_builder.append_value(string_table.next());
inner_list_builder.finish();
- object_builder.finish();
+ object_builder.finish().unwrap();
hint::black_box(variant.finish());
}
@@ -154,7 +154,7 @@ fn bench_object_list_same_schema(c: &mut Criterion) {
list_builder.append_value(string_table.next());
list_builder.finish();
- object_builder.finish();
+ object_builder.finish().unwrap();
}
list_builder.finish();
@@ -189,7 +189,7 @@ fn bench_object_unknown_schema(c: &mut Criterion) {
let key = string_table.next();
inner_object_builder.insert(key, key);
}
- inner_object_builder.finish();
+ inner_object_builder.finish().unwrap();
continue;
}
@@ -202,7 +202,7 @@ fn bench_object_unknown_schema(c: &mut Criterion) {
inner_list_builder.finish();
}
- object_builder.finish();
+ object_builder.finish().unwrap();
hint::black_box(variant.finish());
}
})
@@ -241,7 +241,7 @@ fn bench_object_list_unknown_schema(c: &mut Criterion) {
let key = string_table.next();
inner_object_builder.insert(key, key);
}
- inner_object_builder.finish();
+ inner_object_builder.finish().unwrap();
continue;
}
@@ -254,7 +254,7 @@ fn bench_object_list_unknown_schema(c: &mut Criterion) {
inner_list_builder.finish();
}
- object_builder.finish();
+ object_builder.finish().unwrap();
}
list_builder.finish();
@@ -314,10 +314,10 @@ fn bench_object_partially_same_schema(c: &mut Criterion) {
let key = string_table.next();
inner_object_builder.insert(key, key);
}
- inner_object_builder.finish();
+ inner_object_builder.finish().unwrap();
}
- object_builder.finish();
+ object_builder.finish().unwrap();
hint::black_box(variant.finish());
}
})
@@ -376,10 +376,10 @@ fn bench_object_list_partially_same_schema(c: &mut
Criterion) {
let key = string_table.next();
inner_object_builder.insert(key, key);
}
- inner_object_builder.finish();
+ inner_object_builder.finish().unwrap();
}
- object_builder.finish();
+ object_builder.finish().unwrap();
}
list_builder.finish();
diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs
index f0f3237147..3a8f7af6a0 100644
--- a/parquet-variant/src/builder.rs
+++ b/parquet-variant/src/builder.rs
@@ -16,7 +16,9 @@
// under the License.
use crate::decoder::{VariantBasicType, VariantPrimitiveType};
use crate::{ShortString, Variant, VariantDecimal16, VariantDecimal4,
VariantDecimal8};
+use arrow_schema::ArrowError;
use indexmap::{IndexMap, IndexSet};
+use std::collections::HashSet;
const BASIC_TYPE_BITS: u8 = 2;
const UNIX_EPOCH_DATE: chrono::NaiveDate =
chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
@@ -435,10 +437,27 @@ impl MetadataBuilder {
/// );
///
/// ```
+/// # Example: Unique Field Validation
+///
+/// This example shows how enabling unique field validation will cause an error
+/// if the same field is inserted more than once.
+/// ```
+/// use parquet_variant::VariantBuilder;
+///
+/// let mut builder = VariantBuilder::new().with_validate_unique_fields(true);
+/// let mut obj = builder.new_object();
+///
+/// obj.insert("a", 1);
+/// obj.insert("a", 2); // duplicate field
+///
+/// let result = obj.finish(); // returns Err
+/// assert!(result.is_err());
+/// ```
#[derive(Default)]
pub struct VariantBuilder {
buffer: ValueBuffer,
metadata_builder: MetadataBuilder,
+ validate_unique_fields: bool,
}
impl VariantBuilder {
@@ -446,14 +465,26 @@ impl VariantBuilder {
Self {
buffer: ValueBuffer::default(),
metadata_builder: MetadataBuilder::default(),
+ validate_unique_fields: false,
}
}
+ /// Enables validation of unique field keys in nested objects.
+ ///
+ /// This setting is propagated to all [`ObjectBuilder`]s created through
this [`VariantBuilder`]
+ /// (including via any [`ListBuilder`]), and causes
[`ObjectBuilder::finish()`] to return
+ /// an error if duplicate keys were inserted.
+ pub fn with_validate_unique_fields(mut self, validate_unique_fields: bool)
-> Self {
+ self.validate_unique_fields = validate_unique_fields;
+ self
+ }
+
/// Create an [`ListBuilder`] for creating [`Variant::List`] values.
///
/// See the examples on [`VariantBuilder`] for usage.
pub fn new_list(&mut self) -> ListBuilder {
ListBuilder::new(&mut self.buffer, &mut self.metadata_builder)
+ .with_validate_unique_fields(self.validate_unique_fields)
}
/// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values.
@@ -461,6 +492,7 @@ impl VariantBuilder {
/// See the examples on [`VariantBuilder`] for usage.
pub fn new_object(&mut self) -> ObjectBuilder {
ObjectBuilder::new(&mut self.buffer, &mut self.metadata_builder)
+ .with_validate_unique_fields(self.validate_unique_fields)
}
pub fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T)
{
@@ -482,6 +514,7 @@ pub struct ListBuilder<'a> {
buffer: ValueBuffer,
/// Is there a pending nested object or list that needs to be finalized?
pending: bool,
+ validate_unique_fields: bool,
}
impl<'a> ListBuilder<'a> {
@@ -492,6 +525,7 @@ impl<'a> ListBuilder<'a> {
offsets: vec![0],
buffer: ValueBuffer::default(),
pending: false,
+ validate_unique_fields: false,
}
}
@@ -506,10 +540,20 @@ impl<'a> ListBuilder<'a> {
self.pending = false;
}
+ /// Enables unique field key validation for objects created within this
list.
+ ///
+ /// Propagates the validation flag to any [`ObjectBuilder`]s created using
+ /// [`ListBuilder::new_object`].
+ pub fn with_validate_unique_fields(mut self, validate_unique_fields: bool)
-> Self {
+ self.validate_unique_fields = validate_unique_fields;
+ self
+ }
+
pub fn new_object(&mut self) -> ObjectBuilder {
self.check_new_offset();
- let obj_builder = ObjectBuilder::new(&mut self.buffer,
self.metadata_builder);
+ let obj_builder = ObjectBuilder::new(&mut self.buffer,
self.metadata_builder)
+ .with_validate_unique_fields(self.validate_unique_fields);
self.pending = true;
obj_builder
@@ -518,7 +562,8 @@ impl<'a> ListBuilder<'a> {
pub fn new_list(&mut self) -> ListBuilder {
self.check_new_offset();
- let list_builder = ListBuilder::new(&mut self.buffer,
self.metadata_builder);
+ let list_builder = ListBuilder::new(&mut self.buffer,
self.metadata_builder)
+ .with_validate_unique_fields(self.validate_unique_fields);
self.pending = true;
list_builder
@@ -568,6 +613,9 @@ pub struct ObjectBuilder<'a, 'b> {
buffer: ValueBuffer,
/// Is there a pending list or object that needs to be finalized?
pending: Option<(&'b str, usize)>,
+ validate_unique_fields: bool,
+ /// Set of duplicate fields to report for errors
+ duplicate_fields: HashSet<u32>,
}
impl<'a, 'b> ObjectBuilder<'a, 'b> {
@@ -578,6 +626,8 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
fields: IndexMap::new(),
buffer: ValueBuffer::default(),
pending: None,
+ validate_unique_fields: false,
+ duplicate_fields: HashSet::new(),
}
}
@@ -602,17 +652,30 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
let field_id = self.metadata_builder.upsert_field_name(key);
let field_start = self.buffer.offset();
- self.fields.insert(field_id, field_start);
+ if self.fields.insert(field_id, field_start).is_some() &&
self.validate_unique_fields {
+ self.duplicate_fields.insert(field_id);
+ }
+
self.buffer.append_non_nested_value(value);
}
+ /// Enables validation for unique field keys when inserting into this
object.
+ ///
+ /// When this is enabled, calling [`ObjectBuilder::finish`] will return an
error
+ /// if any duplicate field keys were added using [`ObjectBuilder::insert`].
+ pub fn with_validate_unique_fields(mut self, validate_unique_fields: bool)
-> Self {
+ self.validate_unique_fields = validate_unique_fields;
+ self
+ }
+
/// Return a new [`ObjectBuilder`] to add a nested object with the
specified
/// key to the object.
pub fn new_object(&mut self, key: &'b str) -> ObjectBuilder {
self.check_pending_field();
let field_start = self.buffer.offset();
- let obj_builder = ObjectBuilder::new(&mut self.buffer,
self.metadata_builder);
+ let obj_builder = ObjectBuilder::new(&mut self.buffer,
self.metadata_builder)
+ .with_validate_unique_fields(self.validate_unique_fields);
self.pending = Some((key, field_start));
obj_builder
@@ -624,7 +687,8 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
self.check_pending_field();
let field_start = self.buffer.offset();
- let list_builder = ListBuilder::new(&mut self.buffer,
self.metadata_builder);
+ let list_builder = ListBuilder::new(&mut self.buffer,
self.metadata_builder)
+ .with_validate_unique_fields(self.validate_unique_fields);
self.pending = Some((key, field_start));
list_builder
@@ -633,9 +697,24 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
/// Finalize object
///
/// This consumes self and writes the object to the parent buffer.
- pub fn finish(mut self) {
+ pub fn finish(mut self) -> Result<(), ArrowError> {
self.check_pending_field();
+ if self.validate_unique_fields && !self.duplicate_fields.is_empty() {
+ let mut names = self
+ .duplicate_fields
+ .iter()
+ .map(|id| self.metadata_builder.field_name(*id as usize))
+ .collect::<Vec<_>>();
+
+ names.sort_unstable();
+
+ let joined = names.join(", ");
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Duplicate field keys detected: [{joined}]",
+ )));
+ }
+
let data_size = self.buffer.offset();
let num_fields = self.fields.len();
let is_large = num_fields > u8::MAX as usize;
@@ -672,6 +751,8 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> {
write_offset(self.parent_buffer.inner_mut(), data_size, offset_size);
self.parent_buffer.append_slice(self.buffer.inner());
+
+ Ok(())
}
}
@@ -821,7 +902,7 @@ mod tests {
let mut obj = builder.new_object();
obj.insert("name", "John");
obj.insert("age", 42i8);
- obj.finish();
+ let _ = obj.finish();
}
let (metadata, value) = builder.finish();
@@ -838,7 +919,7 @@ mod tests {
obj.insert("zebra", "stripes"); // ID = 0
obj.insert("apple", "red"); // ID = 1
obj.insert("banana", "yellow"); // ID = 2
- obj.finish();
+ let _ = obj.finish();
}
let (_, value) = builder.finish();
@@ -862,7 +943,7 @@ mod tests {
let mut object_builder = builder.new_object();
object_builder.insert("name", "Ron Artest");
object_builder.insert("name", "Metta World Peace");
- object_builder.finish();
+ let _ = object_builder.finish();
let (metadata, value) = builder.finish();
let variant = Variant::try_new(&metadata, &value).unwrap();
@@ -983,14 +1064,14 @@ mod tests {
let mut object_builder = list_builder.new_object();
object_builder.insert("id", 1);
object_builder.insert("type", "Cauliflower");
- object_builder.finish();
+ let _ = object_builder.finish();
}
{
let mut object_builder = list_builder.new_object();
object_builder.insert("id", 2);
object_builder.insert("type", "Beets");
- object_builder.finish();
+ let _ = object_builder.finish();
}
list_builder.finish();
@@ -1031,13 +1112,13 @@ mod tests {
{
let mut object_builder = list_builder.new_object();
object_builder.insert("a", 1);
- object_builder.finish();
+ let _ = object_builder.finish();
}
{
let mut object_builder = list_builder.new_object();
object_builder.insert("b", 2);
- object_builder.finish();
+ let _ = object_builder.finish();
}
list_builder.finish();
@@ -1084,7 +1165,7 @@ mod tests {
{
let mut object_builder = list_builder.new_object();
object_builder.insert("a", 1);
- object_builder.finish();
+ let _ = object_builder.finish();
}
list_builder.append_value(2);
@@ -1092,7 +1173,7 @@ mod tests {
{
let mut object_builder = list_builder.new_object();
object_builder.insert("b", 2);
- object_builder.finish();
+ let _ = object_builder.finish();
}
list_builder.append_value(3);
@@ -1142,10 +1223,10 @@ mod tests {
{
let mut inner_object_builder =
outer_object_builder.new_object("c");
inner_object_builder.insert("b", "a");
- inner_object_builder.finish();
+ let _ = inner_object_builder.finish();
}
- outer_object_builder.finish();
+ let _ = outer_object_builder.finish();
}
let (metadata, value) = builder.finish();
@@ -1184,11 +1265,11 @@ mod tests {
inner_object_builder.insert("b", false);
inner_object_builder.insert("c", "a");
- inner_object_builder.finish();
+ let _ = inner_object_builder.finish();
}
outer_object_builder.insert("b", false);
- outer_object_builder.finish();
+ let _ = outer_object_builder.finish();
}
let (metadata, value) = builder.finish();
@@ -1232,10 +1313,10 @@ mod tests {
inner_object_list_builder.finish();
}
- inner_object_builder.finish();
+ let _ = inner_object_builder.finish();
}
- outer_object_builder.finish();
+ let _ = outer_object_builder.finish();
}
let (metadata, value) = builder.finish();
@@ -1280,12 +1361,12 @@ mod tests {
{
let mut inner_object_builder =
outer_object_builder.new_object("c");
inner_object_builder.insert("b", "a");
- inner_object_builder.finish();
+ let _ = inner_object_builder.finish();
}
outer_object_builder.insert("b", true);
- outer_object_builder.finish();
+ let _ = outer_object_builder.finish();
}
let (metadata, value) = builder.finish();
@@ -1321,4 +1402,63 @@ mod tests {
assert_eq!(outer_object.field_name(1).unwrap(), "b");
assert_eq!(outer_object.field(1).unwrap(), Variant::from(true));
}
+
+ #[test]
+ fn test_object_without_unique_field_validation() {
+ let mut builder = VariantBuilder::new();
+
+ // Root object with duplicates
+ let mut obj = builder.new_object();
+ obj.insert("a", 1);
+ obj.insert("a", 2);
+ assert!(obj.finish().is_ok());
+
+ // Deeply nested list structure with duplicates
+ let mut outer_list = builder.new_list();
+ let mut inner_list = outer_list.new_list();
+ let mut nested_obj = inner_list.new_object();
+ nested_obj.insert("x", 1);
+ nested_obj.insert("x", 2);
+ assert!(nested_obj.finish().is_ok());
+ }
+
+ #[test]
+ fn test_object_with_unique_field_validation() {
+ let mut builder =
VariantBuilder::new().with_validate_unique_fields(true);
+
+ // Root-level object with duplicates
+ let mut root_obj = builder.new_object();
+ root_obj.insert("a", 1);
+ root_obj.insert("b", 2);
+ root_obj.insert("a", 3);
+ root_obj.insert("b", 4);
+
+ let result = root_obj.finish();
+ assert_eq!(
+ result.unwrap_err().to_string(),
+ "Invalid argument error: Duplicate field keys detected: [a, b]"
+ );
+
+ // Deeply nested list -> list -> object with duplicate
+ let mut outer_list = builder.new_list();
+ let mut inner_list = outer_list.new_list();
+ let mut nested_obj = inner_list.new_object();
+ nested_obj.insert("x", 1);
+ nested_obj.insert("x", 2);
+
+ let nested_result = nested_obj.finish();
+ assert_eq!(
+ nested_result.unwrap_err().to_string(),
+ "Invalid argument error: Duplicate field keys detected: [x]"
+ );
+
+ // Valid object should succeed
+ let mut list = builder.new_list();
+ let mut valid_obj = list.new_object();
+ valid_obj.insert("m", 1);
+ valid_obj.insert("n", 2);
+
+ let valid_result = valid_obj.finish();
+ assert!(valid_result.is_ok());
+ }
}
diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs
index 07ce7b83d1..b27fca6108 100644
--- a/parquet-variant/src/to_json.rs
+++ b/parquet-variant/src/to_json.rs
@@ -859,7 +859,7 @@ mod tests {
obj.insert("age", 30i32);
obj.insert("active", true);
obj.insert("score", 95.5f64);
- obj.finish();
+ obj.finish().unwrap();
}
let (metadata, value) = builder.finish();
@@ -890,7 +890,7 @@ mod tests {
{
let obj = builder.new_object();
- obj.finish();
+ obj.finish().unwrap();
}
let (metadata, value) = builder.finish();
@@ -915,7 +915,7 @@ mod tests {
obj.insert("message", "Hello \"World\"\nWith\tTabs");
obj.insert("path", "C:\\Users\\Alice\\Documents");
obj.insert("unicode", "😀 Smiley");
- obj.finish();
+ obj.finish().unwrap();
}
let (metadata, value) = builder.finish();
@@ -1030,7 +1030,7 @@ mod tests {
obj.insert("zebra", "last");
obj.insert("alpha", "first");
obj.insert("beta", "second");
- obj.finish();
+ obj.finish().unwrap();
}
let (metadata, value) = builder.finish();
@@ -1098,7 +1098,7 @@ mod tests {
obj.insert("float_field", 2.71f64);
obj.insert("null_field", ());
obj.insert("long_field", 999i64);
- obj.finish();
+ obj.finish().unwrap();
}
let (metadata, value) = builder.finish();
diff --git a/parquet-variant/tests/variant_interop.rs
b/parquet-variant/tests/variant_interop.rs
index 20ad7899f2..dcf1200d33 100644
--- a/parquet-variant/tests/variant_interop.rs
+++ b/parquet-variant/tests/variant_interop.rs
@@ -264,7 +264,7 @@ fn variant_object_builder() {
obj.insert("null_field", ());
obj.insert("timestamp_field", "2025-04-16T12:34:56.78");
- obj.finish();
+ obj.finish().unwrap();
let (built_metadata, built_value) = builder.finish();
let actual = Variant::try_new(&built_metadata, &built_value).unwrap();