scovich commented on code in PR #7653:
URL: https://github.com/apache/arrow-rs/pull/7653#discussion_r2150771544


##########
parquet-variant/src/builder.rs:
##########
@@ -0,0 +1,757 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{VariantBasicType, VariantPrimitiveType};
+use crate::Variant;
+use std::collections::HashMap;
+
+const BASIC_TYPE_BITS: u8 = 2;
+const MAX_SHORT_STRING_SIZE: usize = 0x3F;
+
+fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 {
+    (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8
+}
+
+fn short_string_header(len: usize) -> u8 {
+    (len as u8) << 2 | VariantBasicType::ShortString as u8
+}
+
+fn array_header(large: bool, offset_size: u8) -> u8 {
+    let large_bit = if large { 1 } else { 0 };
+    (large_bit << (BASIC_TYPE_BITS + 2))
+        | ((offset_size - 1) << BASIC_TYPE_BITS)
+        | VariantBasicType::Array as u8
+}
+
+fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 {
+    let large_bit = if large { 1 } else { 0 };
+    (large_bit << (BASIC_TYPE_BITS + 4))
+        | ((id_size - 1) << (BASIC_TYPE_BITS + 2))
+        | ((offset_size - 1) << BASIC_TYPE_BITS)
+        | VariantBasicType::Object as u8
+}
+
+fn int_size(v: usize) -> u8 {
+    match v {
+        0..=0xFF => 1,
+        0x100..=0xFFFF => 2,
+        0x10000..=0xFFFFFF => 3,
+        _ => 4,
+    }
+}
+
+/// Write little-endian integer to buffer
+fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) {
+    for i in 0..nbytes {
+        buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8;
+    }
+}
+
+/// Helper to make room for header by moving data
+fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: 
usize) {
+    let current_len = buffer.len();
+    buffer.resize(current_len + header_size, 0);
+
+    let src_start = start_pos;
+    let src_end = current_len;
+    let dst_start = start_pos + header_size;
+
+    buffer.copy_within(src_start..src_end, dst_start);
+}
+
+/// Builder for [`Variant`] values
+///
+/// # Example: create a Primitive Int8
+/// ```
+/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata};
+/// let mut builder = VariantBuilder::new();
+/// builder.append_value(Variant::Int8(42));
+/// // Finish the builder to get the metadata and value
+/// let (metadata, value) = builder.finish();
+/// // use the Variant API to verify the result
+/// let metadata = VariantMetadata::try_new(&metadata).unwrap();
+/// let variant = Variant::try_new(&metadata, &value).unwrap();
+/// assert_eq!(variant, Variant::Int8(42));
+/// ```
+///
+/// # Example: Create an Object
+/// This example shows how to create an object with two fields:
+/// ```json
+/// {
+///  "first_name": "Jiaying",
+///  "last_name": "Li"
+/// }
+/// ```
+///
+/// ```
+/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata};
+/// let mut builder = VariantBuilder::new();
+/// // Create an object builder that will write fields to the object
+/// let mut object_builder = builder.new_object();
+/// object_builder.append_value("first_name", "Jiaying");
+/// object_builder.append_value("last_name", "Li");
+/// object_builder.finish();
+/// // Finish the builder to get the metadata and value
+/// let (metadata, value) = builder.finish();
+/// // use the Variant API to verify the result
+/// let metadata = VariantMetadata::try_new(&metadata).unwrap();
+/// let variant = Variant::try_new(&metadata, &value).unwrap();
+/// let Variant::Object(variant_object) = variant else {
+///   panic!("unexpected variant type")
+/// };
+/// /* TODO: uncomment this, but now VariantObject:field is not implemented
+/// assert_eq!(
+///   variant_object.field("first_name").unwrap(),
+///   Variant::String("Jiaying")
+/// );
+/// assert_eq!(
+///   variant_object.field("last_name").unwrap(),
+///   Variant::String("Li")
+/// );
+/// */
+/// ```
+///
+/// # Example: Create an Array
+///
+/// This example shows how to create an array of integers: `[1, 2, 3]`.
+/// ```
+///  # use parquet_variant::{Variant, VariantBuilder, VariantMetadata};
+///  let mut builder = VariantBuilder::new();
+///  // Create an array builder that will write elements to the array
+///  let mut array_builder = builder.new_array();
+///  array_builder.append_value(1i8);
+///  array_builder.append_value(2i8);
+///  array_builder.append_value(3i8);
+///  array_builder.finish();
+/// // Finish the builder to get the metadata and value
+/// let (metadata, value) = builder.finish();
+/// // use the Variant API to verify the result
+/// let metadata = VariantMetadata::try_new(&metadata).unwrap();
+/// let variant = Variant::try_new(&metadata, &value).unwrap();
+/// let Variant::Array(variant_array) = variant else {
+///   panic!("unexpected variant type")
+/// };
+/// // Verify the array contents
+/// assert_eq!(variant_array.get(0).unwrap(), Variant::Int8(1));
+/// assert_eq!(variant_array.get(1).unwrap(), Variant::Int8(2));
+/// assert_eq!(variant_array.get(2).unwrap(), Variant::Int8(3));
+/// ```
+///
+/// # Example: Array of objects
+///
+/// THis example shows how to create an array of objects:
+/// ```json
+/// [
+///  {
+///   "first_name": "Jiaying",
+///  "last_name": "Li"
+/// },
+///   {
+///    "first_name": "Malthe",
+///    "last_name": "Karbo"
+/// }
+/// ```
+///
+/// TODO
+///
+pub struct VariantBuilder {
+    buffer: Vec<u8>,
+    dict: HashMap<String, u32>,
+    dict_keys: Vec<String>,
+}
+
+impl VariantBuilder {
+    pub fn new() -> Self {
+        Self {
+            buffer: Vec::new(),
+            dict: HashMap::new(),
+            dict_keys: Vec::new(),
+        }
+    }
+
+    fn append_null(&mut self) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Null));
+    }
+
+    fn append_bool(&mut self, value: bool) {
+        let primitive_type = if value {
+            VariantPrimitiveType::BooleanTrue
+        } else {
+            VariantPrimitiveType::BooleanFalse
+        };
+        self.buffer.push(primitive_header(primitive_type));
+    }
+
+    fn append_int8(&mut self, value: i8) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Int8));
+        self.buffer.push(value as u8);
+    }
+
+    fn append_int16(&mut self, value: i16) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Int16));
+        self.buffer.extend_from_slice(&value.to_le_bytes());
+    }
+
+    fn append_int32(&mut self, value: i32) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Int32));
+        self.buffer.extend_from_slice(&value.to_le_bytes());
+    }
+
+    fn append_int64(&mut self, value: i64) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Int64));
+        self.buffer.extend_from_slice(&value.to_le_bytes());
+    }
+
+    fn append_float(&mut self, value: f32) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Float));
+        self.buffer.extend_from_slice(&value.to_le_bytes());
+    }
+
+    fn append_double(&mut self, value: f64) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Double));
+        self.buffer.extend_from_slice(&value.to_le_bytes());
+    }
+
+    fn append_date(&mut self, value: chrono::NaiveDate) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Date));
+        let days_since_epoch = value
+            .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 
1).unwrap())
+            .num_days() as i32;
+        self.buffer
+            .extend_from_slice(&days_since_epoch.to_le_bytes());
+    }
+
+    fn append_timestamp_micros(&mut self, value: 
chrono::DateTime<chrono::Utc>) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::TimestampMicros));
+        let micros = value.timestamp_micros();
+        self.buffer.extend_from_slice(&micros.to_le_bytes());
+    }
+
+    fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros));
+        let micros = value.and_utc().timestamp_micros();
+        self.buffer.extend_from_slice(&micros.to_le_bytes());
+    }
+
+    fn append_decimal4(&mut self, integer: i32, scale: u8) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Decimal4));
+        self.buffer.push(scale);
+        self.buffer.extend_from_slice(&integer.to_le_bytes());
+    }
+
+    fn append_decimal8(&mut self, integer: i64, scale: u8) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Decimal8));
+        self.buffer.push(scale);
+        self.buffer.extend_from_slice(&integer.to_le_bytes());
+    }
+
+    fn append_decimal16(&mut self, integer: i128, scale: u8) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Decimal16));
+        self.buffer.push(scale);
+        self.buffer.extend_from_slice(&integer.to_le_bytes());
+    }
+
+    fn append_binary(&mut self, value: &[u8]) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Binary));
+        self.buffer
+            .extend_from_slice(&(value.len() as u32).to_le_bytes());
+        self.buffer.extend_from_slice(value);
+    }
+
+    fn append_string(&mut self, value: &str) {
+        if value.len() <= MAX_SHORT_STRING_SIZE {
+            self.buffer.push(short_string_header(value.len()));
+            self.buffer.extend_from_slice(value.as_bytes());
+        } else {
+            self.buffer
+                .push(primitive_header(VariantPrimitiveType::String));
+            self.buffer
+                .extend_from_slice(&(value.len() as u32).to_le_bytes());
+            self.buffer.extend_from_slice(value.as_bytes());
+        }
+    }
+
+    /// Add key to dictionary, return its ID
+    fn add_key(&mut self, key: &str) -> u32 {
+        use std::collections::hash_map::Entry;
+        match self.dict.entry(key.to_string()) {
+            Entry::Occupied(entry) => *entry.get(),
+            Entry::Vacant(entry) => {
+                let id = self.dict_keys.len() as u32;
+                entry.insert(id);
+                self.dict_keys.push(key.to_string());

Review Comment:
   Storing offset pairs would also work really nicely if we decide not to 
sort+dedup the `metadata` field entries (**), because then the `Vec<u8>` can 
simply be copied as-is to its final home. 
   
   (**) Sort+dedup is actually quite some work, because it requires finding and 
rewriting all field ids scattered through the `value` bytes. So it would almost 
certainly be a second step (either a preliminary pass before we actually 
generate the variant value bytes, or as a fix-up pass afterward to rewrite 
field ids)



##########
parquet-variant/src/builder.rs:
##########
@@ -0,0 +1,757 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use crate::decoder::{VariantBasicType, VariantPrimitiveType};
+use crate::Variant;
+use std::collections::HashMap;
+
+const BASIC_TYPE_BITS: u8 = 2;
+const MAX_SHORT_STRING_SIZE: usize = 0x3F;
+
+fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 {
+    (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8
+}
+
+fn short_string_header(len: usize) -> u8 {
+    (len as u8) << 2 | VariantBasicType::ShortString as u8
+}
+
+fn array_header(large: bool, offset_size: u8) -> u8 {
+    let large_bit = if large { 1 } else { 0 };
+    (large_bit << (BASIC_TYPE_BITS + 2))
+        | ((offset_size - 1) << BASIC_TYPE_BITS)
+        | VariantBasicType::Array as u8
+}
+
+fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 {
+    let large_bit = if large { 1 } else { 0 };
+    (large_bit << (BASIC_TYPE_BITS + 4))
+        | ((id_size - 1) << (BASIC_TYPE_BITS + 2))
+        | ((offset_size - 1) << BASIC_TYPE_BITS)
+        | VariantBasicType::Object as u8
+}
+
+fn int_size(v: usize) -> u8 {
+    match v {
+        0..=0xFF => 1,
+        0x100..=0xFFFF => 2,
+        0x10000..=0xFFFFFF => 3,
+        _ => 4,
+    }
+}
+
+/// Write little-endian integer to buffer
+fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) {
+    for i in 0..nbytes {
+        buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8;
+    }
+}
+
+/// Helper to make room for header by moving data
+fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: 
usize) {
+    let current_len = buffer.len();
+    buffer.resize(current_len + header_size, 0);
+
+    let src_start = start_pos;
+    let src_end = current_len;
+    let dst_start = start_pos + header_size;
+
+    buffer.copy_within(src_start..src_end, dst_start);
+}
+
+/// Builder for [`Variant`] values
+///
+/// # Example: create a Primitive Int8
+/// ```
+/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata};
+/// let mut builder = VariantBuilder::new();
+/// builder.append_value(Variant::Int8(42));
+/// // Finish the builder to get the metadata and value
+/// let (metadata, value) = builder.finish();
+/// // use the Variant API to verify the result
+/// let metadata = VariantMetadata::try_new(&metadata).unwrap();
+/// let variant = Variant::try_new(&metadata, &value).unwrap();
+/// assert_eq!(variant, Variant::Int8(42));
+/// ```
+///
+/// # Example: Create an Object
+/// This example shows how to create an object with two fields:
+/// ```json
+/// {
+///  "first_name": "Jiaying",
+///  "last_name": "Li"
+/// }
+/// ```
+///
+/// ```
+/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata};
+/// let mut builder = VariantBuilder::new();
+/// // Create an object builder that will write fields to the object
+/// let mut object_builder = builder.new_object();
+/// object_builder.append_value("first_name", "Jiaying");
+/// object_builder.append_value("last_name", "Li");
+/// object_builder.finish();
+/// // Finish the builder to get the metadata and value
+/// let (metadata, value) = builder.finish();
+/// // use the Variant API to verify the result
+/// let metadata = VariantMetadata::try_new(&metadata).unwrap();
+/// let variant = Variant::try_new(&metadata, &value).unwrap();
+/// let Variant::Object(variant_object) = variant else {
+///   panic!("unexpected variant type")
+/// };
+/// /* TODO: uncomment this, but now VariantObject:field is not implemented
+/// assert_eq!(
+///   variant_object.field("first_name").unwrap(),
+///   Variant::String("Jiaying")
+/// );
+/// assert_eq!(
+///   variant_object.field("last_name").unwrap(),
+///   Variant::String("Li")
+/// );
+/// */
+/// ```
+///
+/// # Example: Create an Array
+///
+/// This example shows how to create an array of integers: `[1, 2, 3]`.
+/// ```
+///  # use parquet_variant::{Variant, VariantBuilder, VariantMetadata};
+///  let mut builder = VariantBuilder::new();
+///  // Create an array builder that will write elements to the array
+///  let mut array_builder = builder.new_array();
+///  array_builder.append_value(1i8);
+///  array_builder.append_value(2i8);
+///  array_builder.append_value(3i8);
+///  array_builder.finish();
+/// // Finish the builder to get the metadata and value
+/// let (metadata, value) = builder.finish();
+/// // use the Variant API to verify the result
+/// let metadata = VariantMetadata::try_new(&metadata).unwrap();
+/// let variant = Variant::try_new(&metadata, &value).unwrap();
+/// let Variant::Array(variant_array) = variant else {
+///   panic!("unexpected variant type")
+/// };
+/// // Verify the array contents
+/// assert_eq!(variant_array.get(0).unwrap(), Variant::Int8(1));
+/// assert_eq!(variant_array.get(1).unwrap(), Variant::Int8(2));
+/// assert_eq!(variant_array.get(2).unwrap(), Variant::Int8(3));
+/// ```
+///
+/// # Example: Array of objects
+///
+/// THis example shows how to create an array of objects:
+/// ```json
+/// [
+///  {
+///   "first_name": "Jiaying",
+///  "last_name": "Li"
+/// },
+///   {
+///    "first_name": "Malthe",
+///    "last_name": "Karbo"
+/// }
+/// ```
+///
+/// TODO
+///
+pub struct VariantBuilder {
+    buffer: Vec<u8>,
+    dict: HashMap<String, u32>,
+    dict_keys: Vec<String>,
+}
+
+impl VariantBuilder {
+    pub fn new() -> Self {
+        Self {
+            buffer: Vec::new(),
+            dict: HashMap::new(),
+            dict_keys: Vec::new(),
+        }
+    }
+
+    fn append_null(&mut self) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Null));
+    }
+
+    fn append_bool(&mut self, value: bool) {
+        let primitive_type = if value {
+            VariantPrimitiveType::BooleanTrue
+        } else {
+            VariantPrimitiveType::BooleanFalse
+        };
+        self.buffer.push(primitive_header(primitive_type));
+    }
+
+    fn append_int8(&mut self, value: i8) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Int8));
+        self.buffer.push(value as u8);
+    }
+
+    fn append_int16(&mut self, value: i16) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Int16));
+        self.buffer.extend_from_slice(&value.to_le_bytes());
+    }
+
+    fn append_int32(&mut self, value: i32) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Int32));
+        self.buffer.extend_from_slice(&value.to_le_bytes());
+    }
+
+    fn append_int64(&mut self, value: i64) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Int64));
+        self.buffer.extend_from_slice(&value.to_le_bytes());
+    }
+
+    fn append_float(&mut self, value: f32) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Float));
+        self.buffer.extend_from_slice(&value.to_le_bytes());
+    }
+
+    fn append_double(&mut self, value: f64) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Double));
+        self.buffer.extend_from_slice(&value.to_le_bytes());
+    }
+
+    fn append_date(&mut self, value: chrono::NaiveDate) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Date));
+        let days_since_epoch = value
+            .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 
1).unwrap())
+            .num_days() as i32;
+        self.buffer
+            .extend_from_slice(&days_since_epoch.to_le_bytes());
+    }
+
+    fn append_timestamp_micros(&mut self, value: 
chrono::DateTime<chrono::Utc>) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::TimestampMicros));
+        let micros = value.timestamp_micros();
+        self.buffer.extend_from_slice(&micros.to_le_bytes());
+    }
+
+    fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros));
+        let micros = value.and_utc().timestamp_micros();
+        self.buffer.extend_from_slice(&micros.to_le_bytes());
+    }
+
+    fn append_decimal4(&mut self, integer: i32, scale: u8) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Decimal4));
+        self.buffer.push(scale);
+        self.buffer.extend_from_slice(&integer.to_le_bytes());
+    }
+
+    fn append_decimal8(&mut self, integer: i64, scale: u8) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Decimal8));
+        self.buffer.push(scale);
+        self.buffer.extend_from_slice(&integer.to_le_bytes());
+    }
+
+    fn append_decimal16(&mut self, integer: i128, scale: u8) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Decimal16));
+        self.buffer.push(scale);
+        self.buffer.extend_from_slice(&integer.to_le_bytes());
+    }
+
+    fn append_binary(&mut self, value: &[u8]) {
+        self.buffer
+            .push(primitive_header(VariantPrimitiveType::Binary));
+        self.buffer
+            .extend_from_slice(&(value.len() as u32).to_le_bytes());
+        self.buffer.extend_from_slice(value);
+    }
+
+    fn append_string(&mut self, value: &str) {
+        if value.len() <= MAX_SHORT_STRING_SIZE {
+            self.buffer.push(short_string_header(value.len()));
+            self.buffer.extend_from_slice(value.as_bytes());
+        } else {
+            self.buffer
+                .push(primitive_header(VariantPrimitiveType::String));
+            self.buffer
+                .extend_from_slice(&(value.len() as u32).to_le_bytes());
+            self.buffer.extend_from_slice(value.as_bytes());
+        }
+    }
+
+    /// Add key to dictionary, return its ID
+    fn add_key(&mut self, key: &str) -> u32 {
+        use std::collections::hash_map::Entry;
+        match self.dict.entry(key.to_string()) {
+            Entry::Occupied(entry) => *entry.get(),
+            Entry::Vacant(entry) => {
+                let id = self.dict_keys.len() as u32;
+                entry.insert(id);
+                self.dict_keys.push(key.to_string());

Review Comment:
   Storing offset pairs directly would also work really nicely if we decide not 
to sort+dedup the `metadata` field entries (**), because then the `Vec<u8>` can 
simply be copied as-is to its final home. 
   
   (**) Sort+dedup is actually quite some work, because it requires finding and 
rewriting all field ids scattered through the `value` bytes. So it would almost 
certainly be a second step (either a preliminary pass before we actually 
generate the variant value bytes, or as a fix-up pass afterward to rewrite 
field ids)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to