scovich commented on code in PR #8031:
URL: https://github.com/apache/arrow-rs/pull/8031#discussion_r2669004081


##########
parquet-variant/src/builder/object.rs:
##########
@@ -244,42 +284,133 @@ impl<'a, S: BuilderSpecificState> ObjectBuilder<'a, S> {
             (if is_large { 4 } else { 1 }) + // num_fields
             (num_fields * id_size as usize) + // field IDs
             ((num_fields + 1) * offset_size as usize); // field offsets + 
data_size
+        // Calculated header size becomes a hint; being wrong only risks extra 
allocations.
+        // Make sure to reserve enough capacity to handle the extra bytes 
we'll truncate.
+        let mut bytes_to_splice = Vec::with_capacity(header_size + 3);
+        // let header = object_header(is_large, id_size, offset_size);
+        // bytes_to_splice.push(header);

Review Comment:
   ?



##########
parquet-variant-json/src/lib.rs:
##########
@@ -35,4 +35,5 @@ mod from_json;
 mod to_json;
 
 pub use from_json::JsonToVariant;
+pub use from_json::append_json;

Review Comment:
   ```suggestion
   pub use from_json::{append_json, JsonToVariant};
   ```
   ?



##########
parquet-variant/src/builder/object.rs:
##########
@@ -244,42 +284,133 @@ impl<'a, S: BuilderSpecificState> ObjectBuilder<'a, S> {
             (if is_large { 4 } else { 1 }) + // num_fields
             (num_fields * id_size as usize) + // field IDs
             ((num_fields + 1) * offset_size as usize); // field offsets + 
data_size
+        // Calculated header size becomes a hint; being wrong only risks extra 
allocations.
+        // Make sure to reserve enough capacity to handle the extra bytes 
we'll truncate.
+        let mut bytes_to_splice = Vec::with_capacity(header_size + 3);
+        // let header = object_header(is_large, id_size, offset_size);
+        // bytes_to_splice.push(header);
+
+        match (offset_size, id_size) {
+            (1, 1) => ObjectHeaderWriter::<1, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (1, 2) => ObjectHeaderWriter::<1, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),

Review Comment:
   This is _begging_ for a helper macro to simplify things... but I have no 
idea how to write a macro that expands a cartesian product :( 



##########
parquet-variant/src/builder/object.rs:
##########
@@ -244,42 +284,133 @@ impl<'a, S: BuilderSpecificState> ObjectBuilder<'a, S> {
             (if is_large { 4 } else { 1 }) + // num_fields
             (num_fields * id_size as usize) + // field IDs
             ((num_fields + 1) * offset_size as usize); // field offsets + 
data_size
+        // Calculated header size becomes a hint; being wrong only risks extra 
allocations.
+        // Make sure to reserve enough capacity to handle the extra bytes 
we'll truncate.
+        let mut bytes_to_splice = Vec::with_capacity(header_size + 3);
+        // let header = object_header(is_large, id_size, offset_size);
+        // bytes_to_splice.push(header);
+
+        match (offset_size, id_size) {
+            (1, 1) => ObjectHeaderWriter::<1, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (1, 2) => ObjectHeaderWriter::<1, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),

Review Comment:
   We might define a local macro that captures local state, just to reduce the 
boilerplate a bit:
   ```rust
   macro_rules! write_object_header {
       ($offset_size:ident, $id_size:ident) => {
           ObjectHeaderWriter::<$offset_size, $id_size>::write(...)
       };
   }
   
   match (offset_size, id_size) {
       (1, 1) => write_object_header!(1, 1),
       (1, 2) => write_object_header!(1, 2),
         ..
       (4, 3) => write_object_header!(4, 3),
       (4, 4) => write_object_header!(4, 4),
       _ => { ... }
   }
   ```



##########
parquet-variant/src/builder/object.rs:
##########
@@ -24,14 +24,54 @@ use crate::{
 use arrow_schema::ArrowError;
 use indexmap::IndexMap;
 
-fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 {
-    let large_bit = if large { 1 } else { 0 };
-    (large_bit << (BASIC_TYPE_BITS + 4))
-        | ((id_size - 1) << (BASIC_TYPE_BITS + 2))
-        | ((offset_size - 1) << BASIC_TYPE_BITS)
+fn object_header<const LARGE_BIT: u8, const ID_SIZE: u8, const OFFSET_SIZE: 
u8>() -> u8 {
+    (LARGE_BIT << (BASIC_TYPE_BITS + 4))
+        | ((ID_SIZE - 1) << (BASIC_TYPE_BITS + 2))
+        | ((OFFSET_SIZE - 1) << BASIC_TYPE_BITS)
         | VariantBasicType::Object as u8
 }
 
+struct ObjectHeaderWriter<const OFFSET_SIZE: u8, const ID_SIZE: u8>();
+
+impl<const OFFSET_SIZE: u8, const ID_SIZE: u8> ObjectHeaderWriter<OFFSET_SIZE, 
ID_SIZE> {
+    fn write(
+        dst: &mut Vec<u8>,
+        num_fields: usize,
+        field_ids: impl Iterator<Item = u32>,
+        offsets: impl Iterator<Item = usize>,
+        data_size: usize,
+    ) {
+        let is_large = num_fields > u8::MAX as usize;
+        match is_large {
+            true => {
+                dst.push(object_header::<1, { ID_SIZE }, { OFFSET_SIZE }>());
+                // num_fields will consume 4 bytes when it is larger than 
u8::MAX
+                append_packed_u32::<4>(dst, num_fields);
+            }
+            false => {
+                dst.push(object_header::<0, { ID_SIZE }, { OFFSET_SIZE }>());
+                append_packed_u32::<1>(dst, num_fields);
+            }
+        };

Review Comment:
   ```suggestion
           // num_fields will consume 4 bytes when it is larger than u8::MAX
           if num_fields > u8::MAX as usize {
               dst.push(object_header::<1, { ID_SIZE }, { OFFSET_SIZE }>());
               append_packed_u32::<4>(dst, num_fields);
           } else {
               dst.push(object_header::<0, { ID_SIZE }, { OFFSET_SIZE }>());
               append_packed_u32::<1>(dst, num_fields);
           }
   ```



##########
parquet-variant/src/builder/object.rs:
##########
@@ -244,42 +284,133 @@ impl<'a, S: BuilderSpecificState> ObjectBuilder<'a, S> {
             (if is_large { 4 } else { 1 }) + // num_fields
             (num_fields * id_size as usize) + // field IDs
             ((num_fields + 1) * offset_size as usize); // field offsets + 
data_size
+        // Calculated header size becomes a hint; being wrong only risks extra 
allocations.
+        // Make sure to reserve enough capacity to handle the extra bytes 
we'll truncate.
+        let mut bytes_to_splice = Vec::with_capacity(header_size + 3);
+        // let header = object_header(is_large, id_size, offset_size);
+        // bytes_to_splice.push(header);
+
+        match (offset_size, id_size) {
+            (1, 1) => ObjectHeaderWriter::<1, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (1, 2) => ObjectHeaderWriter::<1, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (1, 3) => ObjectHeaderWriter::<1, 3>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (1, 4) => ObjectHeaderWriter::<1, 4>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (2, 1) => ObjectHeaderWriter::<2, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (2, 2) => ObjectHeaderWriter::<2, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (2, 3) => ObjectHeaderWriter::<2, 3>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (2, 4) => ObjectHeaderWriter::<2, 4>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (3, 1) => ObjectHeaderWriter::<3, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (3, 2) => ObjectHeaderWriter::<3, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (3, 3) => ObjectHeaderWriter::<3, 3>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (3, 4) => ObjectHeaderWriter::<3, 4>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (4, 1) => ObjectHeaderWriter::<4, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (4, 2) => ObjectHeaderWriter::<4, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (4, 3) => ObjectHeaderWriter::<4, 3>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (4, 4) => ObjectHeaderWriter::<4, 4>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            _ => panic!("Unsupported offset_size/id_size combination"),

Review Comment:
   This match arm would be unnecessary if `int_size` returned `OffsetSizeBytes` 
(decoder.rs)... except I don't think const generics work with enum variants? 
We'd have to do e.g.
   ```rust
   use OffsetSizeBytes::*;
   match (offset_size, id_size) {
         ...
       (Four, Two) => ObjectHeaderWriter::<4, 2>::write(...),
         ...
   }
   ```



##########
parquet-variant/src/builder/object.rs:
##########
@@ -244,42 +284,133 @@ impl<'a, S: BuilderSpecificState> ObjectBuilder<'a, S> {
             (if is_large { 4 } else { 1 }) + // num_fields
             (num_fields * id_size as usize) + // field IDs
             ((num_fields + 1) * offset_size as usize); // field offsets + 
data_size
+        // Calculated header size becomes a hint; being wrong only risks extra 
allocations.
+        // Make sure to reserve enough capacity to handle the extra bytes 
we'll truncate.
+        let mut bytes_to_splice = Vec::with_capacity(header_size + 3);
+        // let header = object_header(is_large, id_size, offset_size);
+        // bytes_to_splice.push(header);
+
+        match (offset_size, id_size) {
+            (1, 1) => ObjectHeaderWriter::<1, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (1, 2) => ObjectHeaderWriter::<1, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (1, 3) => ObjectHeaderWriter::<1, 3>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (1, 4) => ObjectHeaderWriter::<1, 4>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (2, 1) => ObjectHeaderWriter::<2, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (2, 2) => ObjectHeaderWriter::<2, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (2, 3) => ObjectHeaderWriter::<2, 3>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (2, 4) => ObjectHeaderWriter::<2, 4>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (3, 1) => ObjectHeaderWriter::<3, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (3, 2) => ObjectHeaderWriter::<3, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (3, 3) => ObjectHeaderWriter::<3, 3>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (3, 4) => ObjectHeaderWriter::<3, 4>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (4, 1) => ObjectHeaderWriter::<4, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (4, 2) => ObjectHeaderWriter::<4, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (4, 3) => ObjectHeaderWriter::<4, 3>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (4, 4) => ObjectHeaderWriter::<4, 4>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            _ => panic!("Unsupported offset_size/id_size combination"),

Review Comment:
   I confirmed that enum variant casting does actually work for const generics, 
so we could also do:
   ```rust
   use OffsetSizeBytes::*;
   match (offset_size, id_size) {
         ...
       (Four, Two) => ObjectHeaderWriter::<{Four as _}, {Two as _}>::write(...),
         ...
   }
   ```



##########
parquet-variant/src/builder/object.rs:
##########
@@ -244,42 +284,133 @@ impl<'a, S: BuilderSpecificState> ObjectBuilder<'a, S> {
             (if is_large { 4 } else { 1 }) + // num_fields
             (num_fields * id_size as usize) + // field IDs
             ((num_fields + 1) * offset_size as usize); // field offsets + 
data_size
+        // Calculated header size becomes a hint; being wrong only risks extra 
allocations.
+        // Make sure to reserve enough capacity to handle the extra bytes 
we'll truncate.
+        let mut bytes_to_splice = Vec::with_capacity(header_size + 3);
+        // let header = object_header(is_large, id_size, offset_size);
+        // bytes_to_splice.push(header);
+
+        match (offset_size, id_size) {
+            (1, 1) => ObjectHeaderWriter::<1, 1>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),
+            (1, 2) => ObjectHeaderWriter::<1, 2>::write(
+                &mut bytes_to_splice,
+                num_fields,
+                self.fields.keys().copied(),
+                self.fields.values().copied(),
+                data_size,
+            ),

Review Comment:
   Combining that with the other idea to have `int_size` return 
`OffsetSizeBytes` would yield:
   ```rust
   macro_rules! write_object_header {
       ($offset_size:ident, $id_size:ident) => {
           ObjectHeaderWriter::<{$offset_size as _}, {$id_size as 
_}>::write(...)
       };
   }
   
   match (offset_size, id_size) {
       (One, One) => write_object_header!(One, One),
       (One, Two) => write_object_header!(One, Two),
         ...
       (Four, Three) => write_object_header!(Four, Three),
       (Four, Four) => write_object_header!(Four, Four),
   }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to