alamb commented on code in PR #7653: URL: https://github.com/apache/arrow-rs/pull/7653#discussion_r2149608322
########## parquet-variant/src/builder.rs: ########## @@ -0,0 +1,757 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::Variant; +use std::collections::HashMap; + +const BASIC_TYPE_BITS: u8 = 2; +const MAX_SHORT_STRING_SIZE: usize = 0x3F; + +fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 { + (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8 +} + +fn short_string_header(len: usize) -> u8 { + (len as u8) << 2 | VariantBasicType::ShortString as u8 +} + +fn array_header(large: bool, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Array as u8 +} + +fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 4)) + | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Object as u8 +} + +fn int_size(v: usize) -> u8 { + match v { + 0..=0xFF => 1, + 0x100..=0xFFFF => 2, + 0x10000..=0xFFFFFF => 3, + _ => 4, + } +} + +/// Write little-endian integer to buffer +fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { + for i in 0..nbytes { + buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8; + } +} + +/// Helper to make room for header by moving data +fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usize) { + let current_len = buffer.len(); + buffer.resize(current_len + header_size, 0); + + let src_start = start_pos; + let src_end = current_len; + let dst_start = start_pos + header_size; + + buffer.copy_within(src_start..src_end, dst_start); +} + +/// Builder for [`Variant`] values +/// +/// # Example: create a Primitive Int8 +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// builder.append_value(Variant::Int8(42)); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// assert_eq!(variant, Variant::Int8(42)); +/// ``` +/// +/// # Example: Create an Object +/// This example shows how to create an object with two fields: +/// ```json +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// } +/// ``` +/// +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an object builder that will write fields to the object +/// let mut object_builder = builder.new_object(); +/// object_builder.append_value("first_name", "Jiaying"); +/// object_builder.append_value("last_name", "Li"); +/// object_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Object(variant_object) = variant else { +/// panic!("unexpected variant type") +/// }; +/// /* TODO: uncomment this, but now VariantObject:field is not implemented +/// assert_eq!( +/// variant_object.field("first_name").unwrap(), +/// Variant::String("Jiaying") +/// ); +/// assert_eq!( +/// variant_object.field("last_name").unwrap(), +/// Variant::String("Li") +/// ); +/// */ +/// ``` +/// +/// # Example: Create an Array +/// +/// This example shows how to create an array of integers: `[1, 2, 3]`. +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an array builder that will write elements to the array +/// let mut array_builder = builder.new_array(); +/// array_builder.append_value(1i8); +/// array_builder.append_value(2i8); +/// array_builder.append_value(3i8); +/// array_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Array(variant_array) = variant else { +/// panic!("unexpected variant type") +/// }; +/// // Verify the array contents +/// assert_eq!(variant_array.get(0).unwrap(), Variant::Int8(1)); +/// assert_eq!(variant_array.get(1).unwrap(), Variant::Int8(2)); +/// assert_eq!(variant_array.get(2).unwrap(), Variant::Int8(3)); +/// ``` +/// +/// # Example: Array of objects +/// +/// THis example shows how to create an array of objects: +/// ```json +/// [ +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// }, +/// { +/// "first_name": "Malthe", +/// "last_name": "Karbo" +/// } +/// ``` +/// +/// TODO +/// +pub struct VariantBuilder { + buffer: Vec<u8>, + dict: HashMap<String, u32>, + dict_keys: Vec<String>, +} + +impl VariantBuilder { + pub fn new() -> Self { + Self { + buffer: Vec::new(), + dict: HashMap::new(), + dict_keys: Vec::new(), + } + } + + fn append_null(&mut self) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Null)); + } + + fn append_bool(&mut self, value: bool) { + let primitive_type = if value { + VariantPrimitiveType::BooleanTrue + } else { + VariantPrimitiveType::BooleanFalse + }; + self.buffer.push(primitive_header(primitive_type)); + } + + fn append_int8(&mut self, value: i8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int8)); + self.buffer.push(value as u8); + } + + fn append_int16(&mut self, value: i16) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int16)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int32(&mut self, value: i32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int32)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_int64(&mut self, value: i64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int64)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_float(&mut self, value: f32) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Float)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_double(&mut self, value: f64) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Double)); + self.buffer.extend_from_slice(&value.to_le_bytes()); + } + + fn append_date(&mut self, value: chrono::NaiveDate) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Date)); + let days_since_epoch = value + .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()) + .num_days() as i32; + self.buffer + .extend_from_slice(&days_since_epoch.to_le_bytes()); + } + + fn append_timestamp_micros(&mut self, value: chrono::DateTime<chrono::Utc>) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampMicros)); + let micros = value.timestamp_micros(); + self.buffer.extend_from_slice(µs.to_le_bytes()); + } + + fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) { + self.buffer + .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros)); + let micros = value.and_utc().timestamp_micros(); + self.buffer.extend_from_slice(µs.to_le_bytes()); + } + + fn append_decimal4(&mut self, integer: i32, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal4)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_decimal8(&mut self, integer: i64, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal8)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_decimal16(&mut self, integer: i128, scale: u8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Decimal16)); + self.buffer.push(scale); + self.buffer.extend_from_slice(&integer.to_le_bytes()); + } + + fn append_binary(&mut self, value: &[u8]) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Binary)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value); + } + + fn append_string(&mut self, value: &str) { + if value.len() <= MAX_SHORT_STRING_SIZE { + self.buffer.push(short_string_header(value.len())); + self.buffer.extend_from_slice(value.as_bytes()); + } else { + self.buffer + .push(primitive_header(VariantPrimitiveType::String)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value.as_bytes()); + } + } + + /// Add key to dictionary, return its ID + fn add_key(&mut self, key: &str) -> u32 { + use std::collections::hash_map::Entry; + match self.dict.entry(key.to_string()) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let id = self.dict_keys.len() as u32; + entry.insert(id); + self.dict_keys.push(key.to_string()); Review Comment: Maybe the dictionary could be a map to `u32` (an index into the relevant entry in the `dict_keys` Vec) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org