alamb commented on code in PR #7833:
URL: https://github.com/apache/arrow-rs/pull/7833#discussion_r2176152402


##########
parquet-variant/src/builder.rs:
##########
@@ -1321,4 +1353,108 @@ mod tests {
         assert_eq!(outer_object.field_name(1).unwrap(), "b");
         assert_eq!(outer_object.field(1).unwrap(), Variant::from(true));
     }
+
+    #[test]
+    fn test_sorted_dictionary() {
+        // check if variant metadatabuilders are equivalent from different 
ways of constructing them
+        let mut variant1 = VariantBuilder::new().with_field_names(&["b", "c", 
"d"]);
+
+        let mut variant2 = {
+            let mut builder = VariantBuilder::new();
+
+            builder.add_field_name("b");
+            builder.add_field_name("c");
+            builder.add_field_name("d");
+
+            builder
+        };
+
+        assert_eq!(
+            variant1.metadata_builder.field_names,
+            variant2.metadata_builder.field_names
+        );
+
+        // check metadata builders say it's sorted
+        assert!(variant1.metadata_builder.is_sorted());
+        assert!(variant2.metadata_builder.is_sorted());
+
+        {
+            // test the bad case and break the sort order
+            variant2.add_field_name("a");
+            assert!(!variant2.metadata_builder.is_sorted());
+
+            // per the spec, make sure the variant will fail to build if only 
metadata is provided
+            let (m, v) = variant2.finish();
+            let res = Variant::try_new(&m, &v);
+            assert!(res.is_err());
+
+            // since it is not sorted, make sure the metadata says so
+            let header = VariantMetadata::try_new(&m).unwrap();
+            assert!(!header.is_sorted());
+        }
+
+        // write out variant1 and make sure the sorted flag is properly encoded
+        variant1.append_value(false);
+
+        let (m, v) = variant1.finish();
+        let res = Variant::try_new(&m, &v);
+        assert!(res.is_ok());
+
+        let header = VariantMetadata::try_new(&m).unwrap();
+        assert!(header.is_sorted());
+    }
+
+    #[test]
+    fn test_object_sorted_dictionary() {
+        // predefine the list of field names
+        let mut variant1 = VariantBuilder::new().with_field_names(&["a", "b", 
"c"]);
+        let mut obj = variant1.new_object();
+
+        obj.insert("c", true);
+        obj.insert("a", false);
+        obj.insert("b", ());
+
+        // verify the field ids are correctly
+        let field_ids_by_insert_order = obj.fields.iter().map(|(&id, _)| 
id).collect::<Vec<_>>();
+        assert_eq!(field_ids_by_insert_order, vec![2, 0, 1]);
+
+        // add a field name that wasn't pre-defined but doesn't break the sort 
order
+        obj.insert("d", 2);
+        let field_ids_by_insert_order = obj.fields.iter().map(|(&id, _)| 
id).collect::<Vec<_>>();
+        assert_eq!(field_ids_by_insert_order, vec![2, 0, 1, 3]);
+
+        obj.finish();
+        let (metadata, value) = variant1.finish();
+        let _ = Variant::try_new(&metadata, &value).unwrap();
+
+        let metadata = VariantMetadata::try_new(&metadata).unwrap();
+        assert!(metadata.is_sorted());
+    }
+
+    #[test]
+    fn test_object_not_sorted_dictionary() {
+        // predefine the list of field names
+        let mut variant1 = VariantBuilder::new().with_field_names(&["b", "c", 
"d"]);
+        let mut obj = variant1.new_object();
+
+        obj.insert("c", true);
+        obj.insert("d", false);
+        obj.insert("b", ());
+
+        // verify the field ids are correctly
+        let field_ids_by_insert_order = obj.fields.iter().map(|(&id, _)| 
id).collect::<Vec<_>>();
+        assert_eq!(field_ids_by_insert_order, vec![1, 2, 0]);
+
+        // add a field name that wasn't pre-defined but breaks the sort order

Review Comment:
   likewise here, I don't think checking the field ids is important.



##########
parquet-variant/src/builder.rs:
##########
@@ -449,6 +463,22 @@ impl VariantBuilder {
         }
     }
 
+    /// Pre-populates the [`MetadataBuilder`] with field names in the given 
order.
+    ///
+    /// This method should be used when creating sorted dictionaries and field 
names are known ahead of time.

Review Comment:
   It might help to give some more context here like
   
   ```rust
   /// This method pre-populates the field name dictionary in the Variant 
Metadata 
   /// with the specified field names, in order. 
   ///
   /// You can use this to prepopulate a VariantBuilder with a sorted 
dictionary if 
   /// you know the field names before hand somehow. Sorted dictionaries can 
accelerate
   /// field access when reading Variants. 
   ```
   
   
   I also think it would be great to add a note and a link to the documentation 
of `VariantBuilder::build` telling about this method
   
   ```rust
   ...
   /// Note: the builder checks if the metadata dictionary is sorted and if it 
is
   /// marks the metadata as sorted, which can accelerate reading Variant 
values.
   /// Field names are added to the metadata dictionary in the order they are 
first 
   /// encoutered. See [`Self::with_field_names`] to pre-populate field names 
in 
   /// order if you know beforehand the fields that can appear in your objects
   fn build(self) -> ...
   ```
   
   



##########
parquet-variant/src/builder.rs:
##########
@@ -233,29 +233,38 @@ impl ValueBuffer {
 
 #[derive(Default)]
 struct MetadataBuilder {
-    field_name_to_id: BTreeMap<String, u32>,
-    field_names: Vec<String>,
+    // Field names -- field_ids are assigned in insert order
+    field_names: IndexSet<String>,
 }
 
 impl MetadataBuilder {
+    /// Pre-populates the list of field names
+    fn from_field_names(field_name: &[&str]) -> Self {
+        Self {
+            field_names: IndexSet::from_iter(field_name.iter().map(|f| 
f.to_string())),
+        }
+    }
+
+    /// Checks whether field names by insertion order is lexicographically 
sorted
+    fn is_sorted(&self) -> bool {
+        !self.field_names.is_empty() && self.field_names.iter().is_sorted()

Review Comment:
   It makes sense to me



##########
parquet-variant/src/builder.rs:
##########
@@ -449,6 +463,22 @@ impl VariantBuilder {
         }
     }
 
+    /// Pre-populates the [`MetadataBuilder`] with field names in the given 
order.
+    ///
+    /// This method should be used when creating sorted dictionaries and field 
names are known ahead of time.
+    pub fn with_field_names(mut self, field_names: &[&str]) -> Self {

Review Comment:
   I recommend making this generic to make it eaiser to pass in 
   
   for example
   ```suggestion
       pub fn with_field_names(mut self, field_names: impl IntoIterator<Item = 
&str>) -> Self {
   ```
   
   



##########
parquet-variant/src/builder.rs:
##########
@@ -1321,4 +1353,108 @@ mod tests {
         assert_eq!(outer_object.field_name(1).unwrap(), "b");
         assert_eq!(outer_object.field(1).unwrap(), Variant::from(true));
     }
+
+    #[test]
+    fn test_sorted_dictionary() {
+        // check if variant metadatabuilders are equivalent from different 
ways of constructing them
+        let mut variant1 = VariantBuilder::new().with_field_names(&["b", "c", 
"d"]);
+
+        let mut variant2 = {
+            let mut builder = VariantBuilder::new();
+
+            builder.add_field_name("b");
+            builder.add_field_name("c");
+            builder.add_field_name("d");
+
+            builder
+        };
+
+        assert_eq!(
+            variant1.metadata_builder.field_names,
+            variant2.metadata_builder.field_names
+        );
+
+        // check metadata builders say it's sorted
+        assert!(variant1.metadata_builder.is_sorted());
+        assert!(variant2.metadata_builder.is_sorted());
+
+        {
+            // test the bad case and break the sort order
+            variant2.add_field_name("a");
+            assert!(!variant2.metadata_builder.is_sorted());
+
+            // per the spec, make sure the variant will fail to build if only 
metadata is provided
+            let (m, v) = variant2.finish();
+            let res = Variant::try_new(&m, &v);
+            assert!(res.is_err());
+
+            // since it is not sorted, make sure the metadata says so
+            let header = VariantMetadata::try_new(&m).unwrap();
+            assert!(!header.is_sorted());
+        }
+
+        // write out variant1 and make sure the sorted flag is properly encoded
+        variant1.append_value(false);
+
+        let (m, v) = variant1.finish();
+        let res = Variant::try_new(&m, &v);
+        assert!(res.is_ok());
+
+        let header = VariantMetadata::try_new(&m).unwrap();
+        assert!(header.is_sorted());
+    }
+
+    #[test]
+    fn test_object_sorted_dictionary() {
+        // predefine the list of field names
+        let mut variant1 = VariantBuilder::new().with_field_names(&["a", "b", 
"c"]);
+        let mut obj = variant1.new_object();
+
+        obj.insert("c", true);
+        obj.insert("a", false);
+        obj.insert("b", ());
+
+        // verify the field ids are correctly
+        let field_ids_by_insert_order = obj.fields.iter().map(|(&id, _)| 
id).collect::<Vec<_>>();
+        assert_eq!(field_ids_by_insert_order, vec![2, 0, 1]);
+
+        // add a field name that wasn't pre-defined but doesn't break the sort 
order
+        obj.insert("d", 2);
+        let field_ids_by_insert_order = obj.fields.iter().map(|(&id, _)| 
id).collect::<Vec<_>>();
+        assert_eq!(field_ids_by_insert_order, vec![2, 0, 1, 3]);
+
+        obj.finish();
+        let (metadata, value) = variant1.finish();
+        let _ = Variant::try_new(&metadata, &value).unwrap();

Review Comment:
   I am don't think it is important to verify these internal details of the 
field_ids -- they key thing is to verify that the object is sorted if the keys 
are pre-added in sorted order



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to