alamb commented on code in PR #7653: URL: https://github.com/apache/arrow-rs/pull/7653#discussion_r2153111190
########## parquet-variant/src/builder.rs: ########## @@ -0,0 +1,757 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::Variant; +use std::collections::HashMap; + +const BASIC_TYPE_BITS: u8 = 2; +const MAX_SHORT_STRING_SIZE: usize = 0x3F; + +fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 { + (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8 +} + +fn short_string_header(len: usize) -> u8 { + (len as u8) << 2 | VariantBasicType::ShortString as u8 +} + +fn array_header(large: bool, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Array as u8 +} + +fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 4)) + | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Object as u8 +} + +fn int_size(v: usize) -> u8 { + match v { + 0..=0xFF => 1, + 0x100..=0xFFFF => 2, + 0x10000..=0xFFFFFF => 3, + _ => 4, + } +} + +/// Write little-endian integer to buffer 
+fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { + for i in 0..nbytes { + buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8; + } +} + +/// Helper to make room for header by moving data +fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usize) { + let current_len = buffer.len(); + buffer.resize(current_len + header_size, 0); + + let src_start = start_pos; + let src_end = current_len; + let dst_start = start_pos + header_size; + + buffer.copy_within(src_start..src_end, dst_start); +} + +/// Builder for [`Variant`] values +/// +/// # Example: create a Primitive Int8 +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// builder.append_value(Variant::Int8(42)); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// assert_eq!(variant, Variant::Int8(42)); +/// ``` +/// +/// # Example: Create an Object +/// This example shows how to create an object with two fields: +/// ```json +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// } +/// ``` +/// +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an object builder that will write fields to the object +/// let mut object_builder = builder.new_object(); +/// object_builder.append_value("first_name", "Jiaying"); +/// object_builder.append_value("last_name", "Li"); +/// object_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// 
let Variant::Object(variant_object) = variant else { +/// panic!("unexpected variant type") +/// }; +/// /* TODO: uncomment this, but now VariantObject:field is not implemented +/// assert_eq!( +/// variant_object.field("first_name").unwrap(), +/// Variant::String("Jiaying") +/// ); +/// assert_eq!( +/// variant_object.field("last_name").unwrap(), +/// Variant::String("Li") +/// ); +/// */ +/// ``` +/// +/// # Example: Create an Array +/// +/// This example shows how to create an array of integers: `[1, 2, 3]`. +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an array builder that will write elements to the array +/// let mut array_builder = builder.new_array(); +/// array_builder.append_value(1i8); +/// array_builder.append_value(2i8); +/// array_builder.append_value(3i8); +/// array_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Array(variant_array) = variant else { +/// panic!("unexpected variant type") +/// }; +/// // Verify the array contents +/// assert_eq!(variant_array.get(0).unwrap(), Variant::Int8(1)); +/// assert_eq!(variant_array.get(1).unwrap(), Variant::Int8(2)); +/// assert_eq!(variant_array.get(2).unwrap(), Variant::Int8(3)); +/// ``` +/// +/// # Example: Array of objects +/// +/// THis example shows how to create an array of objects: +/// ```json +/// [ +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// }, +/// { +/// "first_name": "Malthe", +/// "last_name": "Karbo" +/// } +/// ``` +/// +/// TODO +/// +pub struct VariantBuilder { + buffer: Vec<u8>, + dict: HashMap<String, u32>, + dict_keys: Vec<String>, +} + +impl VariantBuilder { + pub fn new() -> Self { + Self { + buffer: 
Vec::new(), + dict: HashMap::new(), + dict_keys: Vec::new(), + } + } + + fn append_null(&mut self) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Null)); + } + + fn append_bool(&mut self, value: bool) { + let primitive_type = if value { + VariantPrimitiveType::BooleanTrue + } else { + VariantPrimitiveType::BooleanFalse + }; + self.buffer.push(primitive_header(primitive_type)); + } + + fn append_int8(&mut self, value: i8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int8)); + self.buffer.push(value as u8); + } + + fn append_int16(&mut self, value: i16) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int16)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int32(&mut self, value: i32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int32)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int64(&mut self, value: i64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int64)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_float(&mut self, value: f32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Float)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_double(&mut self, value: f64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Double)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_date(&mut self, value: chrono::NaiveDate) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Date)); + let days_since_epoch = value + .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()) + .num_days() as i32; + self.buffer + .extend_from_slice(&days_since_epoch.to_le_bytes()); + } + + fn append_timestamp_micros(&mut self, value: chrono::DateTime<chrono::Utc>) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampMicros)); + let micros = value.timestamp_micros(); + 
self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros)); + let micros = value.and_utc().timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_decimal4(&mut self, integer: i32, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal4)); + self.buffer.push(scale); Review Comment: We probably should (or at least have optional checking 🤔 ) ########## parquet-variant/src/builder.rs: ########## @@ -0,0 +1,757 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::Variant; +use std::collections::HashMap; + +const BASIC_TYPE_BITS: u8 = 2; +const MAX_SHORT_STRING_SIZE: usize = 0x3F; + +fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 { + (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8 +} + +fn short_string_header(len: usize) -> u8 { + (len as u8) << 2 | VariantBasicType::ShortString as u8 +} + +fn array_header(large: bool, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Array as u8 +} + +fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 4)) + | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Object as u8 +} + +fn int_size(v: usize) -> u8 { + match v { + 0..=0xFF => 1, + 0x100..=0xFFFF => 2, + 0x10000..=0xFFFFFF => 3, + _ => 4, + } +} + +/// Write little-endian integer to buffer +fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { + for i in 0..nbytes { + buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8; + } +} + +/// Helper to make room for header by moving data +fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usize) { + let current_len = buffer.len(); + buffer.resize(current_len + header_size, 0); + + let src_start = start_pos; + let src_end = current_len; + let dst_start = start_pos + header_size; + + buffer.copy_within(src_start..src_end, dst_start); +} + +/// Builder for [`Variant`] values +/// +/// # Example: create a Primitive Int8 +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// builder.append_value(Variant::Int8(42)); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = 
builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// assert_eq!(variant, Variant::Int8(42)); +/// ``` +/// +/// # Example: Create an Object +/// This example shows how to create an object with two fields: +/// ```json +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// } +/// ``` +/// +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an object builder that will write fields to the object +/// let mut object_builder = builder.new_object(); +/// object_builder.append_value("first_name", "Jiaying"); +/// object_builder.append_value("last_name", "Li"); +/// object_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Object(variant_object) = variant else { +/// panic!("unexpected variant type") +/// }; +/// /* TODO: uncomment this, but now VariantObject:field is not implemented +/// assert_eq!( +/// variant_object.field("first_name").unwrap(), +/// Variant::String("Jiaying") +/// ); +/// assert_eq!( +/// variant_object.field("last_name").unwrap(), +/// Variant::String("Li") +/// ); +/// */ +/// ``` +/// +/// # Example: Create an Array +/// +/// This example shows how to create an array of integers: `[1, 2, 3]`. 
+/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an array builder that will write elements to the array +/// let mut array_builder = builder.new_array(); +/// array_builder.append_value(1i8); +/// array_builder.append_value(2i8); +/// array_builder.append_value(3i8); +/// array_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Array(variant_array) = variant else { +/// panic!("unexpected variant type") +/// }; +/// // Verify the array contents +/// assert_eq!(variant_array.get(0).unwrap(), Variant::Int8(1)); +/// assert_eq!(variant_array.get(1).unwrap(), Variant::Int8(2)); +/// assert_eq!(variant_array.get(2).unwrap(), Variant::Int8(3)); +/// ``` +/// +/// # Example: Array of objects +/// +/// THis example shows how to create an array of objects: +/// ```json +/// [ +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// }, +/// { +/// "first_name": "Malthe", +/// "last_name": "Karbo" +/// } +/// ``` +/// +/// TODO +/// +pub struct VariantBuilder { + buffer: Vec<u8>, + dict: HashMap<String, u32>, + dict_keys: Vec<String>, +} + +impl VariantBuilder { + pub fn new() -> Self { + Self { + buffer: Vec::new(), + dict: HashMap::new(), + dict_keys: Vec::new(), + } + } + + fn append_null(&mut self) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Null)); + } + + fn append_bool(&mut self, value: bool) { + let primitive_type = if value { + VariantPrimitiveType::BooleanTrue + } else { + VariantPrimitiveType::BooleanFalse + }; + self.buffer.push(primitive_header(primitive_type)); + } + + fn append_int8(&mut self, value: i8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int8)); + 
self.buffer.push(value as u8); + } + + fn append_int16(&mut self, value: i16) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int16)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int32(&mut self, value: i32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int32)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int64(&mut self, value: i64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int64)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_float(&mut self, value: f32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Float)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_double(&mut self, value: f64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Double)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_date(&mut self, value: chrono::NaiveDate) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Date)); + let days_since_epoch = value + .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()) + .num_days() as i32; + self.buffer + .extend_from_slice(&days_since_epoch.to_le_bytes()); + } + + fn append_timestamp_micros(&mut self, value: chrono::DateTime<chrono::Utc>) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampMicros)); + let micros = value.timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros)); + let micros = value.and_utc().timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_decimal4(&mut self, integer: i32, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal4)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn 
append_decimal8(&mut self, integer: i64, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal8)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_decimal16(&mut self, integer: i128, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal16)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_binary(&mut self, value: &[u8]) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Binary)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value); + } + + fn append_string(&mut self, value: &str) { + if value.len() <= MAX_SHORT_STRING_SIZE { + self.buffer.push(short_string_header(value.len())); + self.buffer.extend_from_slice(value.as_bytes()); + } else { + self.buffer + .push(primitive_header(VariantPrimitiveType::String)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value.as_bytes()); + } + } + + /// Add key to dictionary, return its ID + fn add_key(&mut self, key: &str) -> u32 { + use std::collections::hash_map::Entry; + match self.dict.entry(key.to_string()) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let id = self.dict_keys.len() as u32; + entry.insert(id); + self.dict_keys.push(key.to_string()); + id + } + } + } + + fn offset(&self) -> usize { + self.buffer.len() + } + + /// Create an [`ArrayBuilder`] for creating [`Variant::Array`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. + pub fn new_array(&mut self) -> ArrayBuilder { + ArrayBuilder::new(self) + } + + /// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. 
+ pub fn new_object(&mut self) -> ObjectBuilder { + ObjectBuilder::new(self) + } + + pub fn finish(self) -> (Vec<u8>, Vec<u8>) { Review Comment: > There's still the problem that variant can't impl From arrays/objects, but I guess we could add a call to create a new object/array builder from an existing one, in order to achieve nesting? I am not sure we will be able to implement a general purpose `From` impl for native rust slices or objects. We could potentially offer a `serde` style thing (like `serde_json` but `serde_variant` 🤔 ) ########## parquet-variant/src/builder.rs: ########## @@ -0,0 +1,737 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::Variant; +use std::collections::HashMap; + +const BASIC_TYPE_BITS: u8 = 2; +const MAX_SHORT_STRING_SIZE: usize = 0x3F; +const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); + +fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 { + (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8 +} + +fn short_string_header(len: usize) -> u8 { + (len as u8) << 2 | VariantBasicType::ShortString as u8 +} + +fn array_header(large: bool, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Array as u8 +} + +fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 4)) + | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Object as u8 +} + +fn int_size(v: usize) -> u8 { + match v { + 0..=0xFF => 1, + 0x100..=0xFFFF => 2, + 0x10000..=0xFFFFFF => 3, + _ => 4, + } +} + +/// Write little-endian integer to buffer +fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { + for i in 0..nbytes { + buf[i as usize] = (value >> (i * 8)) as u8; + } +} + +/// Helper to make room for header by moving data +fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usize) { + let current_len = buffer.len(); + buffer.resize(current_len + header_size, 0); + + let src_start = start_pos; + let src_end = current_len; + let dst_start = start_pos + header_size; + + buffer.copy_within(src_start..src_end, dst_start); +} + +/// Builder for [`Variant`] values +/// +/// # Example: create a Primitive Int8 +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder}; +/// let mut builder = VariantBuilder::new(); +/// builder.append_value(Variant::Int8(42)); +/// // Finish the builder to 
get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// assert_eq!(variant, Variant::Int8(42)); +/// ``` +/// +/// # Example: Create an Object +/// This example shows how to create an object with two fields: +/// ```json +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// } +/// ``` +/// +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder}; +/// let mut builder = VariantBuilder::new(); +/// // Create an object builder that will write fields to the object +/// let mut object_builder = builder.new_object(); +/// object_builder.append_value("first_name", "Jiaying"); +/// object_builder.append_value("last_name", "Li"); +/// object_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Object(variant_object) = variant else { +/// panic!("unexpected variant type") +/// }; +/// assert_eq!( +/// variant_object.field("first_name").unwrap(), +/// Some(Variant::ShortString("Jiaying")) +/// ); +/// assert_eq!( +/// variant_object.field("last_name").unwrap(), +/// Some(Variant::ShortString("Li")) +/// ); +/// ``` +/// +/// # Example: Create an Array +/// +/// This example shows how to create an array of integers: `[1, 2, 3]`. 
+/// ``` +/// # use parquet_variant::{Variant, VariantBuilder}; +/// let mut builder = VariantBuilder::new(); +/// // Create an array builder that will write elements to the array +/// let mut array_builder = builder.new_array(); +/// array_builder.append_value(1i8); +/// array_builder.append_value(2i8); +/// array_builder.append_value(3i8); +/// array_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::List(variant_list) = variant else { +/// panic!("unexpected variant type") +/// }; +/// // Verify the array contents +/// assert_eq!(variant_list.get(0).unwrap(), Variant::Int8(1)); +/// assert_eq!(variant_list.get(1).unwrap(), Variant::Int8(2)); +/// assert_eq!(variant_list.get(2).unwrap(), Variant::Int8(3)); +/// ``` +/// +/// # Example: Array of objects +/// +/// THis example shows how to create an array of objects: +/// ```json +/// [ +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// }, +/// { +/// "first_name": "Malthe", +/// "last_name": "Karbo" +/// } +/// ``` +/// +/// TODO +/// +pub struct VariantBuilder { + buffer: Vec<u8>, + dict: HashMap<String, u32>, + dict_keys: Vec<String>, +} + +impl VariantBuilder { + pub fn new() -> Self { + Self { + buffer: Vec::new(), + dict: HashMap::new(), + dict_keys: Vec::new(), + } + } + + fn append_null(&mut self) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Null)); + } + + fn append_bool(&mut self, value: bool) { + let primitive_type = if value { + VariantPrimitiveType::BooleanTrue + } else { + VariantPrimitiveType::BooleanFalse + }; + self.buffer.push(primitive_header(primitive_type)); + } + + fn append_int8(&mut self, value: i8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int8)); + self.buffer.push(value as u8); + } + + fn append_int16(&mut self, value: i16) { + 
self.buffer + .push(primitive_header(VariantPrimitiveType::Int16)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int32(&mut self, value: i32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int32)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int64(&mut self, value: i64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int64)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_float(&mut self, value: f32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Float)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_double(&mut self, value: f64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Double)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_date(&mut self, value: chrono::NaiveDate) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Date)); + let days_since_epoch = value.signed_duration_since(UNIX_EPOCH_DATE).num_days() as i32; + self.buffer + .extend_from_slice(&days_since_epoch.to_le_bytes()); + } + + fn append_timestamp_micros(&mut self, value: chrono::DateTime<chrono::Utc>) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampMicros)); + let micros = value.timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros)); + let micros = value.and_utc().timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_decimal4(&mut self, integer: i32, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal4)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_decimal8(&mut self, integer: i64, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal8)); + 
self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_decimal16(&mut self, integer: i128, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal16)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_binary(&mut self, value: &[u8]) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Binary)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value); + } + + fn append_string(&mut self, value: &str) { + if value.len() <= MAX_SHORT_STRING_SIZE { + self.buffer.push(short_string_header(value.len())); + self.buffer.extend_from_slice(value.as_bytes()); + } else { + self.buffer + .push(primitive_header(VariantPrimitiveType::String)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value.as_bytes()); + } + } + + /// Add key to dictionary, return its ID + fn add_key(&mut self, key: &str) -> u32 { + use std::collections::hash_map::Entry; + match self.dict.entry(key.to_string()) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let id = self.dict_keys.len() as u32; + entry.insert(id); + self.dict_keys.push(key.to_string()); + id + } + } + } + + fn offset(&self) -> usize { + self.buffer.len() + } + + /// Create an [`ArrayBuilder`] for creating [`Variant::Array`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. + pub fn new_array(&mut self) -> ArrayBuilder { + ArrayBuilder::new(self) + } + + /// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. 
+ pub fn new_object(&mut self) -> ObjectBuilder { + ObjectBuilder::new(self) + } + + pub fn finish(self) -> (Vec<u8>, Vec<u8>) { + let nkeys = self.dict_keys.len(); + + // Calculate metadata size + let total_dict_size: usize = self.dict_keys.iter().map(|k| k.len()).sum(); + + // Determine appropriate offset size based on the larger of dict size or total string size + let max_offset = std::cmp::max(total_dict_size, nkeys); + let offset_size = int_size(max_offset); + + let offset_start = 1 + offset_size as usize; + let string_start = offset_start + (nkeys + 1) * offset_size as usize; + let metadata_size = string_start + total_dict_size; + + // Pre-allocate exact size to avoid reallocations + let mut metadata = vec![0u8; metadata_size]; + + // Write header: version=1, not sorted, with calculated offset_size + metadata[0] = 0x01 | ((offset_size - 1) << 6); + + // Write dictionary size + write_offset(&mut metadata[1..], nkeys, offset_size); + + // Write offsets and string data + let mut cur_offset = 0; + for (i, key) in self.dict_keys.iter().enumerate() { + write_offset( + &mut metadata[offset_start + i * offset_size as usize..], + cur_offset, + offset_size, + ); + let start = string_start + cur_offset; + metadata[start..start + key.len()].copy_from_slice(key.as_bytes()); + cur_offset += key.len(); + } + // Write final offset + write_offset( + &mut metadata[offset_start + nkeys * offset_size as usize..], + cur_offset, + offset_size, + ); + + (metadata, self.buffer) + } + + pub fn append_value<T: Into<Variant<'static, 'static>>>(&mut self, value: T) { + let variant = value.into(); + match variant { + Variant::Null => self.append_null(), + Variant::BooleanTrue => self.append_bool(true), + Variant::BooleanFalse => self.append_bool(false), + Variant::Int8(v) => self.append_int8(v), + Variant::Int16(v) => self.append_int16(v), + Variant::Int32(v) => self.append_int32(v), + Variant::Int64(v) => self.append_int64(v), + Variant::Date(v) => self.append_date(v), + 
Variant::TimestampMicros(v) => self.append_timestamp_micros(v), + Variant::TimestampNtzMicros(v) => self.append_timestamp_ntz_micros(v), + Variant::Decimal4 { integer, scale } => self.append_decimal4(integer, scale), + Variant::Decimal8 { integer, scale } => self.append_decimal8(integer, scale), + Variant::Decimal16 { integer, scale } => self.append_decimal16(integer, scale), + Variant::Float(v) => self.append_float(v), + Variant::Double(v) => self.append_double(v), + Variant::Binary(v) => self.append_binary(v), + Variant::String(s) | Variant::ShortString(s) => self.append_string(s), + Variant::Object(_) | Variant::List(_) => { + unreachable!("Object and Array variants cannot be created through Into<Variant>") Review Comment: FWIW I think you can append a variant object. I will write a test / file a ticket ########## parquet-variant/src/builder.rs: ########## @@ -0,0 +1,757 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::Variant; +use std::collections::HashMap; + +const BASIC_TYPE_BITS: u8 = 2; +const MAX_SHORT_STRING_SIZE: usize = 0x3F; + +fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 { + (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8 +} + +fn short_string_header(len: usize) -> u8 { + (len as u8) << 2 | VariantBasicType::ShortString as u8 +} + +fn array_header(large: bool, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Array as u8 +} + +fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 4)) + | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Object as u8 +} + +fn int_size(v: usize) -> u8 { + match v { + 0..=0xFF => 1, + 0x100..=0xFFFF => 2, + 0x10000..=0xFFFFFF => 3, + _ => 4, + } +} + +/// Write little-endian integer to buffer +fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { + for i in 0..nbytes { + buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8; + } +} + +/// Helper to make room for header by moving data +fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usize) { + let current_len = buffer.len(); + buffer.resize(current_len + header_size, 0); + + let src_start = start_pos; + let src_end = current_len; + let dst_start = start_pos + header_size; + + buffer.copy_within(src_start..src_end, dst_start); +} + +/// Builder for [`Variant`] values +/// +/// # Example: create a Primitive Int8 +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// builder.append_value(Variant::Int8(42)); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = 
builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// assert_eq!(variant, Variant::Int8(42)); +/// ``` +/// +/// # Example: Create an Object +/// This example shows how to create an object with two fields: +/// ```json +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// } +/// ``` +/// +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an object builder that will write fields to the object +/// let mut object_builder = builder.new_object(); +/// object_builder.append_value("first_name", "Jiaying"); +/// object_builder.append_value("last_name", "Li"); +/// object_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Object(variant_object) = variant else { +/// panic!("unexpected variant type") +/// }; +/// /* TODO: uncomment this, but now VariantObject:field is not implemented +/// assert_eq!( +/// variant_object.field("first_name").unwrap(), +/// Variant::String("Jiaying") +/// ); +/// assert_eq!( +/// variant_object.field("last_name").unwrap(), +/// Variant::String("Li") +/// ); +/// */ +/// ``` +/// +/// # Example: Create an Array +/// +/// This example shows how to create an array of integers: `[1, 2, 3]`. 
+/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an array builder that will write elements to the array +/// let mut array_builder = builder.new_array(); +/// array_builder.append_value(1i8); +/// array_builder.append_value(2i8); +/// array_builder.append_value(3i8); +/// array_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Array(variant_array) = variant else { +/// panic!("unexpected variant type") +/// }; +/// // Verify the array contents +/// assert_eq!(variant_array.get(0).unwrap(), Variant::Int8(1)); +/// assert_eq!(variant_array.get(1).unwrap(), Variant::Int8(2)); +/// assert_eq!(variant_array.get(2).unwrap(), Variant::Int8(3)); +/// ``` +/// +/// # Example: Array of objects +/// +/// THis example shows how to create an array of objects: +/// ```json +/// [ +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// }, +/// { +/// "first_name": "Malthe", +/// "last_name": "Karbo" +/// } +/// ``` +/// +/// TODO +/// +pub struct VariantBuilder { + buffer: Vec<u8>, + dict: HashMap<String, u32>, + dict_keys: Vec<String>, +} + +impl VariantBuilder { + pub fn new() -> Self { + Self { + buffer: Vec::new(), + dict: HashMap::new(), + dict_keys: Vec::new(), + } + } + + fn append_null(&mut self) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Null)); + } + + fn append_bool(&mut self, value: bool) { + let primitive_type = if value { + VariantPrimitiveType::BooleanTrue + } else { + VariantPrimitiveType::BooleanFalse + }; + self.buffer.push(primitive_header(primitive_type)); + } + + fn append_int8(&mut self, value: i8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int8)); + 
self.buffer.push(value as u8); + } + + fn append_int16(&mut self, value: i16) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int16)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int32(&mut self, value: i32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int32)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int64(&mut self, value: i64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int64)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_float(&mut self, value: f32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Float)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_double(&mut self, value: f64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Double)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_date(&mut self, value: chrono::NaiveDate) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Date)); + let days_since_epoch = value + .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()) + .num_days() as i32; + self.buffer + .extend_from_slice(&days_since_epoch.to_le_bytes()); + } + + fn append_timestamp_micros(&mut self, value: chrono::DateTime<chrono::Utc>) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampMicros)); + let micros = value.timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros)); + let micros = value.and_utc().timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_decimal4(&mut self, integer: i32, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal4)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn
append_decimal8(&mut self, integer: i64, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal8)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_decimal16(&mut self, integer: i128, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal16)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_binary(&mut self, value: &[u8]) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Binary)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value); Review Comment: I think we should throw an error if `value.len()` > `u32::MAX` ########## parquet-variant/src/builder.rs: ########## @@ -0,0 +1,757 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::Variant; +use std::collections::HashMap; + +const BASIC_TYPE_BITS: u8 = 2; +const MAX_SHORT_STRING_SIZE: usize = 0x3F; + +fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 { + (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8 +} + +fn short_string_header(len: usize) -> u8 { + (len as u8) << 2 | VariantBasicType::ShortString as u8 +} + +fn array_header(large: bool, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Array as u8 +} + +fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 4)) + | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Object as u8 +} + +fn int_size(v: usize) -> u8 { + match v { + 0..=0xFF => 1, + 0x100..=0xFFFF => 2, + 0x10000..=0xFFFFFF => 3, + _ => 4, + } +} + +/// Write little-endian integer to buffer +fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { + for i in 0..nbytes { + buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8; + } +} + +/// Helper to make room for header by moving data +fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usize) { + let current_len = buffer.len(); + buffer.resize(current_len + header_size, 0); + + let src_start = start_pos; + let src_end = current_len; + let dst_start = start_pos + header_size; + + buffer.copy_within(src_start..src_end, dst_start); +} + +/// Builder for [`Variant`] values +/// +/// # Example: create a Primitive Int8 +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// builder.append_value(Variant::Int8(42)); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = 
builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// assert_eq!(variant, Variant::Int8(42)); +/// ``` +/// +/// # Example: Create an Object +/// This example shows how to create an object with two fields: +/// ```json +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// } +/// ``` +/// +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an object builder that will write fields to the object +/// let mut object_builder = builder.new_object(); +/// object_builder.append_value("first_name", "Jiaying"); +/// object_builder.append_value("last_name", "Li"); +/// object_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Object(variant_object) = variant else { +/// panic!("unexpected variant type") +/// }; +/// /* TODO: uncomment this, but now VariantObject:field is not implemented +/// assert_eq!( +/// variant_object.field("first_name").unwrap(), +/// Variant::String("Jiaying") +/// ); +/// assert_eq!( +/// variant_object.field("last_name").unwrap(), +/// Variant::String("Li") +/// ); +/// */ +/// ``` +/// +/// # Example: Create an Array +/// +/// This example shows how to create an array of integers: `[1, 2, 3]`. 
+/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an array builder that will write elements to the array +/// let mut array_builder = builder.new_array(); +/// array_builder.append_value(1i8); +/// array_builder.append_value(2i8); +/// array_builder.append_value(3i8); +/// array_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Array(variant_array) = variant else { +/// panic!("unexpected variant type") +/// }; +/// // Verify the array contents +/// assert_eq!(variant_array.get(0).unwrap(), Variant::Int8(1)); +/// assert_eq!(variant_array.get(1).unwrap(), Variant::Int8(2)); +/// assert_eq!(variant_array.get(2).unwrap(), Variant::Int8(3)); +/// ``` +/// +/// # Example: Array of objects +/// +/// THis example shows how to create an array of objects: +/// ```json +/// [ +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// }, +/// { +/// "first_name": "Malthe", +/// "last_name": "Karbo" +/// } +/// ``` +/// +/// TODO +/// +pub struct VariantBuilder { + buffer: Vec<u8>, + dict: HashMap<String, u32>, + dict_keys: Vec<String>, +} + +impl VariantBuilder { + pub fn new() -> Self { + Self { + buffer: Vec::new(), + dict: HashMap::new(), + dict_keys: Vec::new(), + } + } + + fn append_null(&mut self) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Null)); + } + + fn append_bool(&mut self, value: bool) { + let primitive_type = if value { + VariantPrimitiveType::BooleanTrue + } else { + VariantPrimitiveType::BooleanFalse + }; + self.buffer.push(primitive_header(primitive_type)); + } + + fn append_int8(&mut self, value: i8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int8)); + 
self.buffer.push(value as u8); + } + + fn append_int16(&mut self, value: i16) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int16)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int32(&mut self, value: i32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int32)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int64(&mut self, value: i64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int64)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_float(&mut self, value: f32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Float)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_double(&mut self, value: f64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Double)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_date(&mut self, value: chrono::NaiveDate) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Date)); + let days_since_epoch = value + .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()) + .num_days() as i32; + self.buffer + .extend_from_slice(&days_since_epoch.to_le_bytes()); + } + + fn append_timestamp_micros(&mut self, value: chrono::DateTime<chrono::Utc>) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampMicros)); + let micros = value.timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros)); + let micros = value.and_utc().timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_decimal4(&mut self, integer: i32, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal4)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn
append_decimal8(&mut self, integer: i64, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal8)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_decimal16(&mut self, integer: i128, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal16)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_binary(&mut self, value: &[u8]) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Binary)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value); + } + + fn append_string(&mut self, value: &str) { + if value.len() <= MAX_SHORT_STRING_SIZE { + self.buffer.push(short_string_header(value.len())); + self.buffer.extend_from_slice(value.as_bytes()); + } else { + self.buffer + .push(primitive_header(VariantPrimitiveType::String)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value.as_bytes()); + } + } + + /// Add key to dictionary, return its ID + fn add_key(&mut self, key: &str) -> u32 { + use std::collections::hash_map::Entry; + match self.dict.entry(key.to_string()) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let id = self.dict_keys.len() as u32; + entry.insert(id); + self.dict_keys.push(key.to_string()); + id + } + } + } + + fn offset(&self) -> usize { + self.buffer.len() + } + + /// Create an [`ArrayBuilder`] for creating [`Variant::Array`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. + pub fn new_array(&mut self) -> ArrayBuilder { + ArrayBuilder::new(self) + } + + /// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. 
+ pub fn new_object(&mut self) -> ObjectBuilder { + ObjectBuilder::new(self) + } + + pub fn finish(self) -> (Vec<u8>, Vec<u8>) { + let nkeys = self.dict_keys.len(); + + // Calculate metadata size + let total_dict_size: usize = self.dict_keys.iter().map(|k| k.len()).sum(); + + // Determine appropriate offset size based on the larger of dict size or total string size + let max_offset = std::cmp::max(total_dict_size, nkeys); + let offset_size = int_size(max_offset); + + let offset_start = 1 + offset_size as usize; + let string_start = offset_start + (nkeys + 1) * offset_size as usize; + let metadata_size = string_start + total_dict_size; + + // Pre-allocate exact size to avoid reallocations + let mut metadata = vec![0u8; metadata_size]; + + // Write header: version=1, not sorted, with calculated offset_size + metadata[0] = 0x01 | ((offset_size - 1) << 6); + + // Write dictionary size + write_offset(&mut metadata[1..], nkeys, offset_size); + + // Write offsets and string data + let mut cur_offset = 0; + for (i, key) in self.dict_keys.iter().enumerate() { + write_offset( + &mut metadata[offset_start + i * offset_size as usize..], + cur_offset, + offset_size, + ); + let start = string_start + cur_offset; + metadata[start..start + key.len()].copy_from_slice(key.as_bytes()); + cur_offset += key.len(); + } + // Write final offset + write_offset( + &mut metadata[offset_start + nkeys * offset_size as usize..], + cur_offset, + offset_size, + ); + + (metadata, self.buffer) + } + + pub fn append_value<T: Into<Variant<'static, 'static>>>(&mut self, value: T) { Review Comment: I did so in d7d144916e ########## parquet-variant/src/builder.rs: ########## @@ -0,0 +1,757 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::Variant; +use std::collections::HashMap; + +const BASIC_TYPE_BITS: u8 = 2; +const MAX_SHORT_STRING_SIZE: usize = 0x3F; + +fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 { + (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8 +} + +fn short_string_header(len: usize) -> u8 { + (len as u8) << 2 | VariantBasicType::ShortString as u8 +} + +fn array_header(large: bool, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Array as u8 +} + +fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 4)) + | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Object as u8 +} + +fn int_size(v: usize) -> u8 { + match v { + 0..=0xFF => 1, + 0x100..=0xFFFF => 2, + 0x10000..=0xFFFFFF => 3, + _ => 4, + } +} + +/// Write little-endian integer to buffer +fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { + for i in 0..nbytes { + buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8; + } +} + +/// Helper to make room for header by moving data +fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usize) { + let 
current_len = buffer.len(); + buffer.resize(current_len + header_size, 0); + + let src_start = start_pos; + let src_end = current_len; + let dst_start = start_pos + header_size; + + buffer.copy_within(src_start..src_end, dst_start); +} + +/// Builder for [`Variant`] values +/// +/// # Example: create a Primitive Int8 +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// builder.append_value(Variant::Int8(42)); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// assert_eq!(variant, Variant::Int8(42)); +/// ``` +/// +/// # Example: Create an Object +/// This example shows how to create an object with two fields: +/// ```json +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// } +/// ``` +/// +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an object builder that will write fields to the object +/// let mut object_builder = builder.new_object(); +/// object_builder.append_value("first_name", "Jiaying"); +/// object_builder.append_value("last_name", "Li"); +/// object_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Object(variant_object) = variant else { +/// panic!("unexpected variant type") +/// }; +/// /* TODO: uncomment this, but now VariantObject:field is not implemented +/// assert_eq!( +/// variant_object.field("first_name").unwrap(), +/// Variant::String("Jiaying") +/// ); +/// 
assert_eq!( +/// variant_object.field("last_name").unwrap(), +/// Variant::String("Li") +/// ); +/// */ +/// ``` +/// +/// # Example: Create an Array +/// +/// This example shows how to create an array of integers: `[1, 2, 3]`. +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an array builder that will write elements to the array +/// let mut array_builder = builder.new_array(); +/// array_builder.append_value(1i8); +/// array_builder.append_value(2i8); +/// array_builder.append_value(3i8); +/// array_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Array(variant_array) = variant else { +/// panic!("unexpected variant type") +/// }; +/// // Verify the array contents +/// assert_eq!(variant_array.get(0).unwrap(), Variant::Int8(1)); +/// assert_eq!(variant_array.get(1).unwrap(), Variant::Int8(2)); +/// assert_eq!(variant_array.get(2).unwrap(), Variant::Int8(3)); +/// ``` +/// +/// # Example: Array of objects +/// +/// THis example shows how to create an array of objects: +/// ```json +/// [ +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// }, +/// { +/// "first_name": "Malthe", +/// "last_name": "Karbo" +/// } +/// ``` +/// +/// TODO +/// +pub struct VariantBuilder { + buffer: Vec<u8>, + dict: HashMap<String, u32>, + dict_keys: Vec<String>, +} + +impl VariantBuilder { + pub fn new() -> Self { + Self { + buffer: Vec::new(), + dict: HashMap::new(), + dict_keys: Vec::new(), + } + } + + fn append_null(&mut self) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Null)); + } + + fn append_bool(&mut self, value: bool) { + let primitive_type = if value { + VariantPrimitiveType::BooleanTrue 
+ } else { + VariantPrimitiveType::BooleanFalse + }; + self.buffer.push(primitive_header(primitive_type)); + } + + fn append_int8(&mut self, value: i8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int8)); + self.buffer.push(value as u8); + } + + fn append_int16(&mut self, value: i16) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int16)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int32(&mut self, value: i32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int32)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int64(&mut self, value: i64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int64)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_float(&mut self, value: f32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Float)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_double(&mut self, value: f64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Double)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_date(&mut self, value: chrono::NaiveDate) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Date)); + let days_since_epoch = value + .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()) + .num_days() as i32; + self.buffer + .extend_from_slice(&days_since_epoch.to_le_bytes()); + } + + fn append_timestamp_micros(&mut self, value: chrono::DateTime<chrono::Utc>) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampMicros)); + let micros = value.timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros)); + let micros = value.and_utc().timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn
append_decimal4(&mut self, integer: i32, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal4)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_decimal8(&mut self, integer: i64, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal8)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_decimal16(&mut self, integer: i128, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal16)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_binary(&mut self, value: &[u8]) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Binary)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value); + } + + fn append_string(&mut self, value: &str) { + if value.len() <= MAX_SHORT_STRING_SIZE { + self.buffer.push(short_string_header(value.len())); + self.buffer.extend_from_slice(value.as_bytes()); + } else { + self.buffer + .push(primitive_header(VariantPrimitiveType::String)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value.as_bytes()); + } + } + + /// Add key to dictionary, return its ID + fn add_key(&mut self, key: &str) -> u32 { + use std::collections::hash_map::Entry; + match self.dict.entry(key.to_string()) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let id = self.dict_keys.len() as u32; + entry.insert(id); + self.dict_keys.push(key.to_string()); + id + } + } + } + + fn offset(&self) -> usize { + self.buffer.len() + } + + /// Create an [`ArrayBuilder`] for creating [`Variant::Array`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. 
+ pub fn new_array(&mut self) -> ArrayBuilder { + ArrayBuilder::new(self) + } + + /// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. + pub fn new_object(&mut self) -> ObjectBuilder { Review Comment: Maybe the builder can leave room for the list length and then append the values, and then go back and update the length when the list is finished. This would get tricky for building "large" lists as the length field may not be known upfront. We could also potentially introduce a function like `new_large_object()` or something for callers to hint up front that their object has many fields, and if they use `new_object` but push too many values, fall back to copying. ########## parquet-variant/src/builder.rs: ########## @@ -0,0 +1,757 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::Variant; +use std::collections::HashMap; + +const BASIC_TYPE_BITS: u8 = 2; +const MAX_SHORT_STRING_SIZE: usize = 0x3F; + +fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 { + (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8 +} + +fn short_string_header(len: usize) -> u8 { + (len as u8) << 2 | VariantBasicType::ShortString as u8 +} + +fn array_header(large: bool, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Array as u8 +} + +fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 4)) + | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Object as u8 +} + +fn int_size(v: usize) -> u8 { + match v { + 0..=0xFF => 1, + 0x100..=0xFFFF => 2, + 0x10000..=0xFFFFFF => 3, + _ => 4, + } +} + +/// Write little-endian integer to buffer +fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { + for i in 0..nbytes { + buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8; + } +} + +/// Helper to make room for header by moving data +fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usize) { + let current_len = buffer.len(); + buffer.resize(current_len + header_size, 0); + + let src_start = start_pos; + let src_end = current_len; + let dst_start = start_pos + header_size; + + buffer.copy_within(src_start..src_end, dst_start); +} + +/// Builder for [`Variant`] values +/// +/// # Example: create a Primitive Int8 +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// builder.append_value(Variant::Int8(42)); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = 
builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// assert_eq!(variant, Variant::Int8(42)); +/// ``` +/// +/// # Example: Create an Object +/// This example shows how to create an object with two fields: +/// ```json +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// } +/// ``` +/// +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an object builder that will write fields to the object +/// let mut object_builder = builder.new_object(); +/// object_builder.append_value("first_name", "Jiaying"); +/// object_builder.append_value("last_name", "Li"); +/// object_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Object(variant_object) = variant else { +/// panic!("unexpected variant type") +/// }; +/// /* TODO: uncomment this, but now VariantObject:field is not implemented +/// assert_eq!( +/// variant_object.field("first_name").unwrap(), +/// Variant::String("Jiaying") +/// ); +/// assert_eq!( +/// variant_object.field("last_name").unwrap(), +/// Variant::String("Li") +/// ); +/// */ +/// ``` +/// +/// # Example: Create an Array +/// +/// This example shows how to create an array of integers: `[1, 2, 3]`. 
+/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an array builder that will write elements to the array +/// let mut array_builder = builder.new_array(); +/// array_builder.append_value(1i8); +/// array_builder.append_value(2i8); +/// array_builder.append_value(3i8); +/// array_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Array(variant_array) = variant else { +/// panic!("unexpected variant type") +/// }; +/// // Verify the array contents +/// assert_eq!(variant_array.get(0).unwrap(), Variant::Int8(1)); +/// assert_eq!(variant_array.get(1).unwrap(), Variant::Int8(2)); +/// assert_eq!(variant_array.get(2).unwrap(), Variant::Int8(3)); +/// ``` +/// +/// # Example: Array of objects +/// +/// This example shows how to create an array of objects: +/// ```json +/// [ +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// }, +/// { +/// "first_name": "Malthe", +/// "last_name": "Karbo" +/// } +/// ] +/// ``` +/// +/// TODO +/// +pub struct VariantBuilder { + buffer: Vec<u8>, + dict: HashMap<String, u32>, + dict_keys: Vec<String>, +} + +impl VariantBuilder { + pub fn new() -> Self { + Self { + buffer: Vec::new(), + dict: HashMap::new(), + dict_keys: Vec::new(), + } + } + + fn append_null(&mut self) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Null)); + } + + fn append_bool(&mut self, value: bool) { + let primitive_type = if value { + VariantPrimitiveType::BooleanTrue + } else { + VariantPrimitiveType::BooleanFalse + }; + self.buffer.push(primitive_header(primitive_type)); + } + + fn append_int8(&mut self, value: i8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int8)); + 
self.buffer.push(value as u8); + } + + fn append_int16(&mut self, value: i16) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int16)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int32(&mut self, value: i32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int32)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int64(&mut self, value: i64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int64)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_float(&mut self, value: f32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Float)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_double(&mut self, value: f64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Double)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_date(&mut self, value: chrono::NaiveDate) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Date)); + let days_since_epoch = value + .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()) + .num_days() as i32; + self.buffer + .extend_from_slice(&days_since_epoch.to_le_bytes()); + } + + fn append_timestamp_micros(&mut self, value: chrono::DateTime<chrono::Utc>) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampMicros)); + let micros = value.timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros)); + let micros = value.and_utc().timestamp_micros(); + self.buffer.extend_from_slice(&micros.to_le_bytes()); + } + + fn append_decimal4(&mut self, integer: i32, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal4)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn 
append_decimal8(&mut self, integer: i64, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal8)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_decimal16(&mut self, integer: i128, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal16)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_binary(&mut self, value: &[u8]) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Binary)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value); + } + + fn append_string(&mut self, value: &str) { + if value.len() <= MAX_SHORT_STRING_SIZE { + self.buffer.push(short_string_header(value.len())); + self.buffer.extend_from_slice(value.as_bytes()); + } else { + self.buffer + .push(primitive_header(VariantPrimitiveType::String)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value.as_bytes()); + } + } + + /// Add key to dictionary, return its ID + fn add_key(&mut self, key: &str) -> u32 { + use std::collections::hash_map::Entry; + match self.dict.entry(key.to_string()) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let id = self.dict_keys.len() as u32; + entry.insert(id); + self.dict_keys.push(key.to_string()); + id + } + } + } + + fn offset(&self) -> usize { + self.buffer.len() + } + + /// Create an [`ArrayBuilder`] for creating [`Variant::Array`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. + pub fn new_array(&mut self) -> ArrayBuilder { + ArrayBuilder::new(self) + } + + /// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. 
+ pub fn new_object(&mut self) -> ObjectBuilder { + ObjectBuilder::new(self) + } + + pub fn finish(self) -> (Vec<u8>, Vec<u8>) { + let nkeys = self.dict_keys.len(); + + // Calculate metadata size + let total_dict_size: usize = self.dict_keys.iter().map(|k| k.len()).sum(); + + // Determine appropriate offset size based on the larger of dict size or total string size + let max_offset = std::cmp::max(total_dict_size, nkeys); + let offset_size = int_size(max_offset); + + let offset_start = 1 + offset_size as usize; + let string_start = offset_start + (nkeys + 1) * offset_size as usize; + let metadata_size = string_start + total_dict_size; + + // Pre-allocate exact size to avoid reallocations + let mut metadata = vec![0u8; metadata_size]; + + // Write header: version=1, not sorted, with calculated offset_size + metadata[0] = 0x01 | ((offset_size - 1) << 6); + + // Write dictionary size + write_offset(&mut metadata[1..], nkeys, offset_size); + + // Write offsets and string data + let mut cur_offset = 0; + for (i, key) in self.dict_keys.iter().enumerate() { + write_offset( + &mut metadata[offset_start + i * offset_size as usize..], + cur_offset, + offset_size, + ); + let start = string_start + cur_offset; + metadata[start..start + key.len()].copy_from_slice(key.as_bytes()); + cur_offset += key.len(); + } + // Write final offset + write_offset( + &mut metadata[offset_start + nkeys * offset_size as usize..], + cur_offset, + offset_size, + ); + + (metadata, self.buffer) + } + + pub fn append_value<T: Into<Variant<'static, 'static>>>(&mut self, value: T) { + let variant = value.into(); + match variant { + Variant::Null => self.append_null(), + Variant::BooleanTrue => self.append_bool(true), + Variant::BooleanFalse => self.append_bool(false), + Variant::Int8(v) => self.append_int8(v), + Variant::Int16(v) => self.append_int16(v), + Variant::Int32(v) => self.append_int32(v), + Variant::Int64(v) => self.append_int64(v), + Variant::Date(v) => self.append_date(v), + 
Variant::TimestampMicros(v) => self.append_timestamp_micros(v), + Variant::TimestampNtzMicros(v) => self.append_timestamp_ntz_micros(v), + Variant::Decimal4 { integer, scale } => self.append_decimal4(integer, scale), + Variant::Decimal8 { integer, scale } => self.append_decimal8(integer, scale), + Variant::Decimal16 { integer, scale } => self.append_decimal16(integer, scale), + Variant::Float(v) => self.append_float(v), + Variant::Double(v) => self.append_double(v), + Variant::Binary(v) => self.append_binary(v), + Variant::String(s) | Variant::ShortString(s) => self.append_string(s), Review Comment: I think it would be nice to have a `self.append_short_string` and `self.append_string` and then call the appropriate ones. We could have `self.append_short_string` return an error potentially as well if the string was too long -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
