alamb commented on code in PR #7535: URL: https://github.com/apache/arrow-rs/pull/7535#discussion_r2103135519
########## parquet-variant/src/decoder.rs: ########## @@ -0,0 +1,149 @@ +// NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 +// And the feedback there. +use arrow_schema::ArrowError; +use std::{array::TryFromSliceError, str}; + +use crate::utils::{array_from_slice, invalid_utf8_err, non_empty_slice, slice_from_slice}; + +#[derive(Debug, Clone, Copy)] +pub enum VariantBasicType { + Primitive = 0, + ShortString = 1, + Object = 2, + Array = 3, +} + +#[derive(Debug, Clone, Copy)] +pub enum VariantPrimitiveType { + Null = 0, + BooleanTrue = 1, + BooleanFalse = 2, + Int8 = 3, + // TODO: Add types for the rest of primitives, once API is agreed upon + String = 16, +} + +/// Extracts the basic type from a header byte +pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits + let basic_type = match basic_type { + 0 => VariantBasicType::Primitive, + 1 => VariantBasicType::ShortString, + 2 => VariantBasicType::Object, + 3 => VariantBasicType::Array, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown basic type: {}", + basic_type + ))) + } + }; + Ok(basic_type) +} + +/// Extracts the primitive type from a header byte +pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> { Review Comment: @scovich 's comments got me thinking we could implement this as `impl TryFrom<u8> for VariantPrimitiveType` Which I think would be more "idomatic" rust and would make for better look. Similarly for other APIs in this module ########## parquet-variant/src/test_variant.rs: ########## @@ -0,0 +1,72 @@ +//! End-to-end check: (almost) every sample from apache/parquet-testing/variant +//! can be parsed into our `Variant`. + +// NOTE: We keep this file separate rather than a test mod inside variant.rs because it should be +// moved to the test folder later +use std::fs; +use std::path::{Path, PathBuf}; + +use crate::variant::{Variant, VariantMetadata}; +use arrow_schema::ArrowError; + +fn cases_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join("parquet-testing") + .join("variant") +} + +fn load_case(name: &str) -> Result<(Vec<u8>, Vec<u8>), ArrowError> { + let root = cases_dir(); + let meta = fs::read(root.join(format!("{name}.metadata")))?; + let val = fs::read(root.join(format!("{name}.value")))?; + Ok((meta, val)) +} + +fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { + vec![ + ("primitive_boolean_false", Variant::BooleanFalse), + ("primitive_boolean_true", Variant::BooleanTrue), + ("primitive_int8", Variant::Int8(42)), + // Using the From<String> trait + ("primitive_string", Variant::from("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")), + // Using the From<String> trait + ("short_string", Variant::from("Less than 64 bytes (❤\u{fe0f} with utf8)")), + // TODO Reenable when https://github.com/apache/parquet-testing/issues/81 is fixed Review Comment: BTW I made a PR to fix this - https://github.com/apache/parquet-testing/pull/84 ########## parquet-variant/src/decoder.rs: ########## @@ -0,0 +1,199 @@ +// NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 +// And the feedback there. +use crate::variant::VariantType; +use arrow_schema::ArrowError; +use std::{array::TryFromSliceError, str}; + +#[derive(Debug, Clone, Copy)] +pub enum VariantBasicType { + Primitive = 0, + ShortString = 1, + Object = 2, + Array = 3, +} + +#[derive(Debug, Clone, Copy)] +pub enum VariantPrimitiveType { + Null = 0, + BooleanTrue = 1, + BooleanFalse = 2, + Int8 = 3, + // TODO: Add 'legs' for the rest of primitives, once API is agreed upon + String = 16, +} + +/// Extracts the basic type from a header byte +pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits + let basic_type = match basic_type { + 0 => VariantBasicType::Primitive, + 1 => VariantBasicType::ShortString, + 2 => VariantBasicType::Object, + 3 => VariantBasicType::Array, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown basic type: {}", + basic_type + ))) + } + }; + Ok(basic_type) +} + +/// Extracts the primitive type from a header byte +pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + //// Primitive type is encoded in the last 6 bits of the header byte + let primitive_type = (header >> 2) & 0x3F; + let primitive_type = match primitive_type { + 0 => VariantPrimitiveType::Null, + 1 => VariantPrimitiveType::BooleanTrue, + 2 => VariantPrimitiveType::BooleanFalse, + 3 => VariantPrimitiveType::Int8, + // TODO: Add 'legs' for the rest, once API is agreed upon + 16 => VariantPrimitiveType::String, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown primitive type: {}", + primitive_type + ))) + } + }; + Ok(primitive_type) +} + +/// Extracts the variant type from the value section of a variant. The variant +/// type is defined as the set of all basic types and all primitive types. +pub fn get_variant_type(value: &[u8]) -> Result<VariantType, ArrowError> { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Tried to get variant type from empty buffer array".to_string(), + )); + } + let header = value[0]; + let variant_type = match get_basic_type(header)? { + VariantBasicType::Primitive => match get_primitive_type(header)? { + VariantPrimitiveType::Null => VariantType::Null, + VariantPrimitiveType::Int8 => VariantType::Int8, + VariantPrimitiveType::BooleanTrue => VariantType::BooleanTrue, + VariantPrimitiveType::BooleanFalse => VariantType::BooleanFalse, + // TODO: Add 'legs' for the rest, once API is agreed upon + VariantPrimitiveType::String => VariantType::String, + }, + VariantBasicType::ShortString => VariantType::ShortString, + VariantBasicType::Object => VariantType::Object, + VariantBasicType::Array => VariantType::Array, + }; + Ok(variant_type) +} + +/// To be used in `map_err` when unpacking an integer from a slice of bytes. +fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { + ArrowError::InvalidArgumentError(e.to_string()) +} + +/// Constructs the error message for an invalid UTF-8 string. +fn invalid_utf8_err() -> ArrowError { + ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()) +} + +/// Decodes an Int8 from the value section of a variant. +pub(crate) fn decode_int8(value: &[u8]) -> Result<i8, ArrowError> { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Got empty value buffer so can't decode into int8.".to_string(), + )); + } + let value = i8::from_le_bytes([value[1]]); + Ok(value) +} + +/// Decodes a long string from the value section of a variant. +pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { + if value.len() < 5 { + return Err(ArrowError::InvalidArgumentError( + "Tried to decode value buffer into long_string, but it's too short (len<5)." + .to_string(), + )); + } + let len = + u32::from_le_bytes(value[1..=4].try_into().map_err(map_try_from_slice_error)?) as usize; + if value.len() < len + 5 { + let err_str = format!("The length of the buffer for the long_string is soo short, it is {} and it should be at least {} ({} < {} + 5)", value.len(), len + 5 , value.len(), len); + return Err(ArrowError::InvalidArgumentError(err_str)); + } + let string_bytes = &value[5..5 + len]; + let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?; + Ok(string) +} + +/// Decodes a short string from the value section of a variant. +pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Tried to decode value buffer into short_string, but it's empty.".to_string(), + )); + } + let len = ((value[0] & 0b11111100) >> 2) as usize; + + if value.len() < len + 1 { + let err_str = format!("The length of the buffer for the short_string is too short, it is {} and it should be at least {} ({} < {} + 1)", value.len(), len + 1 , value.len(), len); + return Err(ArrowError::InvalidArgumentError(err_str)); + } + let string_bytes = &value[1..1 + len]; + let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?; + Ok(string) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_i8() -> Result<(), ArrowError> { + let value = [ + 0 | 3 << 2, // Primitive type for i8 + 42, + ]; + let result = decode_int8(&value)?; + assert_eq!(result, 42); + Ok(()) + } + + #[test] + fn test_short_string() -> Result<(), ArrowError> { + let value = [ + 1 | 5 << 2, // Basic type for short string | length of short string + 'H' as u8, + 'e' as u8, + 'l' as u8, + 'l' as u8, + 'o' as u8, + 'o' as u8, + ]; + let result = decode_short_string(&value)?; + assert_eq!(result, "Hello"); + Ok(()) + } + + #[test] + fn test_string() -> Result<(), ArrowError> { Review Comment: I think they are ok, maybe we just shouldn't go too crazy. The other thing we could do is to make them examples in the documentation -- which would help both users and maintainers. ########## parquet-variant/src/variant.rs: ########## @@ -0,0 +1,321 @@ +use std::{borrow::Cow, ops::Index}; + +use crate::decoder::{self, get_variant_type}; +use arrow_schema::ArrowError; +use strum_macros::EnumDiscriminants; + +#[derive(Clone, Copy, Debug, PartialEq)] +/// Encodes the Variant Metadata, see the Variant spec file for more information +pub struct VariantMetadata<'m> { + bytes: &'m [u8], +} + +impl<'m> VariantMetadata<'m> { + /// View the raw bytes (needed by very low-level decoders) + #[inline] + pub const fn as_bytes(&self) -> &'m [u8] { + self.bytes + } + + /// Whether the dictionary keys are sorted and unique + pub fn is_sorted(&self) -> bool { + todo!() + } + + /// Get the dict length + pub fn dict_len(&self) -> Result<usize, ArrowError> { + let end_location = self.offset_size()? as usize + 1; + if self.bytes.len() < end_location { + let err_str = format!( + "Invalid metadata bytes, must have at least length {} but has length {}", + &end_location, + self.bytes.len() + ); + return Err(ArrowError::InvalidArgumentError(err_str)); + } + let dict_len_bytes = &self.bytes[1..end_location]; + let dict_len = usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Unable to convert dictionary_size bytes into usize: {}", + e, + )) + })?); + Ok(dict_len) + } + pub fn version(&self) -> usize { + todo!() + } + + /// Get the offset by index + pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> { + todo!() + } + + /// Get the header byte, which has the following form + /// 7 6 5 4 3 0 + /// +-------+---+---+---------------+ + /// header | | | | version | + /// +-------+---+---+---------------+ + /// ^ ^ + /// | +-- sorted_strings + /// +-- offset_size_minus_one + /// The version is a 4-bit value that must always contain the value 1. + /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique. + /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field. + /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1 + pub fn header(&self) -> Result<u8, ArrowError> { + if self.bytes.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Can't get header from empty buffer".to_string(), + )); + } + Ok(self.bytes[0]) + } + + /// Get the offset_minus_one value from the header + pub fn offset_size_minus_one(&self) -> Result<u8, ArrowError> { + if self.bytes.is_empty() { + Err(ArrowError::InvalidArgumentError( + "Tried to get offset_size_minus_one from header, but self.bytes buffer is emtpy." + .to_string(), + )) + } else { + Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits + } + } + + /// Get the offset_size + pub fn offset_size(&self) -> Result<u8, ArrowError> { + Ok(self.offset_size_minus_one()? + 1) + } + + /// Get the offsets as an iterator + // TODO: Do we want this kind of API? + // TODO: Test once API is agreed upon + pub fn offsets(&'m self) -> Result<impl Iterator<Item = (usize, usize)> + 'm, ArrowError> { + struct OffsetIterators<'m> { + buffer: &'m [u8], + total: usize, + seen: usize, + offset_size: usize, + } + impl<'m> Iterator for OffsetIterators<'m> { + type Item = (usize, usize); // (start, end) positions of the bytes + + // TODO: Check bounds here or ensure they're correct + + fn next(&mut self) -> Option<Self::Item> { + // +1 to skip the first offset + if self.seen < self.total { + let start = usize::from_le_bytes( + self.buffer[(self.seen ) * self.offset_size + 1 // +1 to skip header + ..(self.seen ) * self.offset_size + 1] + .try_into() + .ok()?, + ); + self.seen += 1; + let end = usize::from_le_bytes( + self.buffer[(self.seen ) * self.offset_size + 1 // +1 to skip header + ..(self.seen ) * self.offset_size + 1] + .try_into() + .ok()?, + ); + + Some((start, end)) + } else { + None + } + } + } + let iterator: OffsetIterators = OffsetIterators { + buffer: self.bytes, + total: self.dict_len()?, + seen: 0, + offset_size: self.offset_size()? as usize, + }; + Ok(iterator) + } + /// Get the key-name-bytes by index + pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> { + todo!() + } + /// Get all key-names as an Iterator of strings + // TODO: Result + pub fn fields(&self) -> impl Iterator<Item = &'m str> { + // Do the same as for offsets + todo!(); + vec![].into_iter() + } +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct VariantObject<'m, 'v> { + pub metadata: VariantMetadata<'m>, + pub value: &'v [u8], +} + +impl<'m, 'v> VariantObject<'m, 'v> { + pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 'v>)>, ArrowError> { + todo!(); + #[allow(unreachable_code)] // Just to infer the return type + Ok(vec![].into_iter()) + } + + pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> { + todo!() + } +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct VariantArray<'m, 'v> { + pub metadata: VariantMetadata<'m>, + pub value: &'v [u8], +} + +// TODO: Let's agree on the API here, also should we expose a way to get the values as a vec of +// variants for those who want it? Would require allocations. +impl<'m, 'v> VariantArray<'m, 'v> { + pub fn len(&self) -> usize { + todo!() + } + + pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, ArrowError> { + todo!(); + #[allow(unreachable_code)] // Just to infer the return type + Ok(vec![].into_iter()) + } +} + +impl<'m, 'v> Index<usize> for VariantArray<'m, 'v> { + type Output = Variant<'m, 'v>; + + fn index(&self, _index: usize) -> &Self::Output { Review Comment: I agree --that will likely become obvious if we try to implement it ########## parquet-variant/src/decoder.rs: ########## @@ -0,0 +1,199 @@ +// NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 +// And the feedback there. +use crate::variant::VariantType; +use arrow_schema::ArrowError; +use std::{array::TryFromSliceError, str}; + +#[derive(Debug, Clone, Copy)] +pub enum VariantBasicType { + Primitive = 0, + ShortString = 1, + Object = 2, + Array = 3, +} + +#[derive(Debug, Clone, Copy)] +pub enum VariantPrimitiveType { + Null = 0, + BooleanTrue = 1, + BooleanFalse = 2, + Int8 = 3, + // TODO: Add 'legs' for the rest of primitives, once API is agreed upon + String = 16, +} + +/// Extracts the basic type from a header byte +pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits + let basic_type = match basic_type { + 0 => VariantBasicType::Primitive, + 1 => VariantBasicType::ShortString, + 2 => VariantBasicType::Object, + 3 => VariantBasicType::Array, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown basic type: {}", + basic_type + ))) + } + }; + Ok(basic_type) +} + +/// Extracts the primitive type from a header byte +pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + //// Primitive type is encoded in the last 6 bits of the header byte + let primitive_type = (header >> 2) & 0x3F; + let primitive_type = match primitive_type { + 0 => VariantPrimitiveType::Null, + 1 => VariantPrimitiveType::BooleanTrue, + 2 => VariantPrimitiveType::BooleanFalse, + 3 => VariantPrimitiveType::Int8, + // TODO: Add 'legs' for the rest, once API is agreed upon + 16 => VariantPrimitiveType::String, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown primitive type: {}", + primitive_type + ))) + } + }; + Ok(primitive_type) +} + +/// Extracts the variant type from the value section of a variant. The variant +/// type is defined as the set of all basic types and all primitive types. +pub fn get_variant_type(value: &[u8]) -> Result<VariantType, ArrowError> { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Tried to get variant type from empty buffer array".to_string(), + )); + } + let header = value[0]; + let variant_type = match get_basic_type(header)? { + VariantBasicType::Primitive => match get_primitive_type(header)? { + VariantPrimitiveType::Null => VariantType::Null, + VariantPrimitiveType::Int8 => VariantType::Int8, + VariantPrimitiveType::BooleanTrue => VariantType::BooleanTrue, + VariantPrimitiveType::BooleanFalse => VariantType::BooleanFalse, + // TODO: Add 'legs' for the rest, once API is agreed upon + VariantPrimitiveType::String => VariantType::String, + }, + VariantBasicType::ShortString => VariantType::ShortString, + VariantBasicType::Object => VariantType::Object, + VariantBasicType::Array => VariantType::Array, + }; + Ok(variant_type) +} + +/// To be used in `map_err` when unpacking an integer from a slice of bytes. +fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { + ArrowError::InvalidArgumentError(e.to_string()) +} Review Comment: I think it would be better actually to improve this conversion a bit to explain the error comes from trying to parse variants. If we just add a blanket `From` impl the code will just work, but users will see messages like `Invalid slice` which will be pretty unspecific ########## parquet-variant/src/variant.rs: ########## @@ -0,0 +1,415 @@ +use crate::decoder::{ + self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, +}; +use crate::utils::{array_from_slice, invalid_utf8_err, non_empty_slice, slice_from_slice}; +use arrow_schema::ArrowError; +use std::{ + num::TryFromIntError, + ops::{Index, Range}, + str, +}; + +#[derive(Clone, Debug, Copy, PartialEq)] +enum OffsetSizeBytes { + One = 1, + Two = 2, + Three = 3, + Four = 4, +} + +impl OffsetSizeBytes { + fn try_new(offset_size_minus_one: u8) -> Result<Self, ArrowError> { + use OffsetSizeBytes::*; + let result = match offset_size_minus_one { + 0 => One, + 1 => Two, + 2 => Three, + 3 => Four, + _ => { + return Err(ArrowError::InvalidArgumentError( + "offset_size_minus_one must be 0–3".to_string(), + )) + } + }; + Ok(result) + } + + fn unpack_usize( + &self, + bytes: &[u8], + byte_offset: usize, // how many bytes to skip + offset_index: usize, // which offset in an array of offsets + ) -> Result<usize, ArrowError> { + use OffsetSizeBytes::*; + let offset = byte_offset + (*self as usize) * offset_index; + let result = match self { + One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(), + Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(), + Three => todo!(), // ugh, endianness + Four => u32::from_le_bytes(array_from_slice(bytes, offset)?) + .try_into() + .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?, + }; + Ok(result) + } +} + +#[derive(Clone, Debug, Copy, PartialEq)] +pub(crate) struct VariantMetadataHeader { + version: u8, + is_sorted: bool, + /// Note: This is `offset_size_minus_one` + 1 + offset_size: OffsetSizeBytes, +} + +impl<'m> VariantMetadataHeader { + /// Tries to construct the variant metadata header, which has the form + /// 7 6 5 4 3 0 + /// +-------+---+---+---------------+ + /// header | | | | version | + /// +-------+---+---+---------------+ + /// ^ ^ + /// | +-- sorted_strings + /// +-- offset_size_minus_one + /// The version is a 4-bit value that must always contain the value 1. + /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique. + /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field. + /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1 + pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> { + let Some(header) = bytes.get(0) else { + return Err(ArrowError::InvalidArgumentError( + "Received zero bytes".to_string(), + )); + }; + + let version = header & 0x0F; // First four bits + let is_sorted = (header & 0x10) != 0; // Fifth bit + let offset_size_minus_one = (header >> 6) & 0x03; // Last two bits + Ok(Self { + version, + is_sorted, + offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?, + }) + } +} + +#[derive(Clone, Copy, Debug, PartialEq)] +/// Encodes the Variant Metadata, see the Variant spec file for more information +pub struct VariantMetadata<'m> { + bytes: &'m [u8], + header: VariantMetadataHeader, + dict_size: usize, +} + +impl<'m> VariantMetadata<'m> { + /// View the raw bytes (needed by very low-level decoders) + #[inline] + pub const fn as_bytes(&self) -> &'m [u8] { + self.bytes + } + + pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> { + let header = VariantMetadataHeader::try_new(bytes)?; + // Offset 1, index 0 because first element after header is dictionary size + let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?; + + // TODO: Refactor, add test for validation + let valid = (0..=dict_size) + .map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1)) + .scan(0, |prev, cur| { + let Ok(cur_offset) = cur else { + return Some(false); + }; + // Skip the first offset, which is always 0 + if *prev == 0 { + *prev = cur_offset; + return Some(true); + } + + let valid = cur_offset > *prev; + *prev = cur_offset; + Some(valid) + }) + .all(|valid| valid); + + if !valid { + return Err(ArrowError::InvalidArgumentError( + "Offsets are not monotonically increasing".to_string(), + )); + } + Ok(Self { + bytes, + header, + dict_size, + }) + } + + /// Whether the dictionary keys are sorted and unique + pub fn is_sorted(&self) -> bool { + self.header.is_sorted + } + + /// Get the dictionary size + pub fn dictionary_size(&self) -> usize { + self.dict_size + } + pub fn version(&self) -> usize { + todo!() + } + + /// Get the offset by key-index + pub fn get_offset_by(&self, index: usize) -> Result<Range<usize>, ArrowError> { + // TODO: Should we memoize the offsets? There could be thousands of them (https://github.com/apache/arrow-rs/pull/7535#discussion_r2101351294) Review Comment: I don't understand what we would memoize. It seems like this code is pretty efficient to me as it does some bit manipulation and finds the correct offset If the bit manipulation is too much, the only way I can think of being faster is templating `VariantMetadata` on the size or something: ```rust struct VariantMetadata<'m, OffsetSizBytes> { ... } ``` But that will complicate the implementation of Object (which will now have to know what offset size it is) Maybe we can wait for some benchmarks before making it more complicated ########## parquet-variant/src/decoder.rs: ########## @@ -0,0 +1,199 @@ +// NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 +// And the feedback there. +use crate::variant::VariantType; +use arrow_schema::ArrowError; +use std::{array::TryFromSliceError, str}; + +#[derive(Debug, Clone, Copy)] +pub enum VariantBasicType { + Primitive = 0, + ShortString = 1, + Object = 2, + Array = 3, +} + +#[derive(Debug, Clone, Copy)] +pub enum VariantPrimitiveType { + Null = 0, + BooleanTrue = 1, + BooleanFalse = 2, + Int8 = 3, + // TODO: Add 'legs' for the rest of primitives, once API is agreed upon + String = 16, +} + +/// Extracts the basic type from a header byte +pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits + let basic_type = match basic_type { + 0 => VariantBasicType::Primitive, + 1 => VariantBasicType::ShortString, + 2 => VariantBasicType::Object, + 3 => VariantBasicType::Array, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown basic type: {}", + basic_type + ))) + } Review Comment: if we use unreachable it would be nice to add a comment pointing out that a 2 bit value can only have 4 values (which is why it is provably unreachable, I think) ########## parquet-variant/src/variant.rs: ########## @@ -0,0 +1,321 @@ +use std::{borrow::Cow, ops::Index}; + +use crate::decoder::{self, get_variant_type}; +use arrow_schema::ArrowError; +use strum_macros::EnumDiscriminants; + +#[derive(Clone, Copy, Debug, PartialEq)] +/// Encodes the Variant Metadata, see the Variant spec file for more information +pub struct VariantMetadata<'m> { + bytes: &'m [u8], +} + +impl<'m> VariantMetadata<'m> { + /// View the raw bytes (needed by very low-level decoders) + #[inline] + pub const fn as_bytes(&self) -> &'m [u8] { + self.bytes + } + + /// Whether the dictionary keys are sorted and unique + pub fn is_sorted(&self) -> bool { + todo!() + } + + /// Get the dict length + pub fn dict_len(&self) -> Result<usize, ArrowError> { + let end_location = self.offset_size()? as usize + 1; + if self.bytes.len() < end_location { + let err_str = format!( + "Invalid metadata bytes, must have at least length {} but has length {}", + &end_location, + self.bytes.len() + ); + return Err(ArrowError::InvalidArgumentError(err_str)); + } + let dict_len_bytes = &self.bytes[1..end_location]; + let dict_len = usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Unable to convert dictionary_size bytes into usize: {}", + e, + )) + })?); + Ok(dict_len) + } + pub fn version(&self) -> usize { + todo!() + } + + /// Get the offset by index + pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> { + todo!() + } + + /// Get the header byte, which has the following form + /// 7 6 5 4 3 0 + /// +-------+---+---+---------------+ + /// header | | | | version | + /// +-------+---+---+---------------+ + /// ^ ^ + /// | +-- sorted_strings + /// +-- offset_size_minus_one + /// The version is a 4-bit value that must always contain the value 1. + /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique. + /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field. + /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1 + pub fn header(&self) -> Result<u8, ArrowError> { + if self.bytes.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Can't get header from empty buffer".to_string(), + )); + } + Ok(self.bytes[0]) + } + + /// Get the offset_minus_one value from the header + pub fn offset_size_minus_one(&self) -> Result<u8, ArrowError> { + if self.bytes.is_empty() { + Err(ArrowError::InvalidArgumentError( + "Tried to get offset_size_minus_one from header, but self.bytes buffer is emtpy." + .to_string(), + )) + } else { + Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits + } + } + + /// Get the offset_size + pub fn offset_size(&self) -> Result<u8, ArrowError> { + Ok(self.offset_size_minus_one()? + 1) + } + + /// Get the offsets as an iterator + // TODO: Do we want this kind of API? + // TODO: Test once API is agreed upon + pub fn offsets(&'m self) -> Result<impl Iterator<Item = (usize, usize)> + 'm, ArrowError> { + struct OffsetIterators<'m> { + buffer: &'m [u8], + total: usize, + seen: usize, + offset_size: usize, + } + impl<'m> Iterator for OffsetIterators<'m> { + type Item = (usize, usize); // (start, end) positions of the bytes + + // TODO: Check bounds here or ensure they're correct + + fn next(&mut self) -> Option<Self::Item> { + // +1 to skip the first offset + if self.seen < self.total { + let start = usize::from_le_bytes( + self.buffer[(self.seen ) * self.offset_size + 1 // +1 to skip header + ..(self.seen ) * self.offset_size + 1] + .try_into() + .ok()?, + ); + self.seen += 1; + let end = usize::from_le_bytes( + self.buffer[(self.seen ) * self.offset_size + 1 // +1 to skip header + ..(self.seen ) * self.offset_size + 1] + .try_into() + .ok()?, + ); + + Some((start, end)) + } else { + None + } + } + } + let iterator: OffsetIterators = OffsetIterators { + buffer: self.bytes, + total: self.dict_len()?, + seen: 0, + offset_size: self.offset_size()? as usize, + }; + Ok(iterator) + } + /// Get the key-name-bytes by index + pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> { + todo!() + } + /// Get all key-names as an Iterator of strings + // TODO: Result + pub fn fields(&self) -> impl Iterator<Item = &'m str> { + // Do the same as for offsets + todo!(); + vec![].into_iter() + } +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct VariantObject<'m, 'v> { + pub metadata: VariantMetadata<'m>, + pub value: &'v [u8], +} + +impl<'m, 'v> VariantObject<'m, 'v> { + pub fn fields(&self) -> Result<impl Iterator<Item = (&'m str, Variant<'m, 'v>)>, ArrowError> { + todo!(); + #[allow(unreachable_code)] // Just to infer the return type + Ok(vec![].into_iter()) + } + + pub fn field(&self, _name: &'m str) -> Result<Variant<'m, 'v>, ArrowError> { + todo!() + } +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct VariantArray<'m, 'v> { + pub metadata: VariantMetadata<'m>, + pub value: &'v [u8], +} + +// TODO: Let's agree on the API here, also should we expose a way to get the values as a vec of +// variants for those who want it? Would require allocations. +impl<'m, 'v> VariantArray<'m, 'v> { + pub fn len(&self) -> usize { + todo!() + } + + pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, ArrowError> { + todo!(); + #[allow(unreachable_code)] // Just to infer the return type + Ok(vec![].into_iter()) + } +} + +impl<'m, 'v> Index<usize> for VariantArray<'m, 'v> { + type Output = Variant<'m, 'v>; + + fn index(&self, _index: usize) -> &Self::Output { + todo!() + } +} + +/// Variant value. May contain references to metadata and value +// TODO: Add copy if no Cow on String and Shortstring? +#[derive(Clone, Debug, PartialEq, EnumDiscriminants)] +#[strum_discriminants(name(VariantType))] +pub enum Variant<'m, 'v> { + // TODO: Add 'legs' for the rest of the primitive types, once API is agreed upon + Null, + Int8(i8), + + BooleanTrue, + BooleanFalse, + + // only need the *value* buffer + // TODO: Do we want Cow<'v, str> over &'v str? It enables From<String> - discuss on PR + String(Cow<'v, str>), + ShortString(Cow<'v, str>), + + // need both metadata & value + Object(VariantObject<'m, 'v>), + Array(VariantArray<'m, 'v>), +} + +impl<'m, 'v> Variant<'m, 'v> { + /// Parse the buffers and return the appropriate variant. + pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result<Self, ArrowError> { + Ok(match get_variant_type(value)? { + VariantType::Null => Variant::Null, + VariantType::BooleanTrue => Variant::BooleanTrue, + VariantType::BooleanFalse => Variant::BooleanFalse, + + VariantType::Int8 => Variant::Int8(decoder::decode_int8(value)?), + + // TODO: Add 'legs' for the rest of the primitive types, once API is agreed upon + VariantType::String => { + Variant::String(Cow::Borrowed(decoder::decode_long_string(value)?)) + } + + VariantType::ShortString => { + Variant::ShortString(Cow::Borrowed(decoder::decode_short_string(value)?)) + } + + VariantType::Object => Variant::Object(VariantObject { + metadata: VariantMetadata { bytes: metadata }, + value, + }), + VariantType::Array => Variant::Array(VariantArray { + metadata: VariantMetadata { bytes: metadata }, + value, + }), + }) + } + + pub fn as_null(&self) -> Option<()> { + match self { + Variant::Null => Some(()), + _ => None, + } + } + + pub fn as_boolean(&self) -> Option<bool> { + match self { + Variant::BooleanTrue => Some(true), + Variant::BooleanFalse => Some(false), + _ => None, + } + } + + pub fn as_string(&'v self) -> Option<&'v str> { + match self { + Variant::String(s) | Variant::ShortString(s) => Some(s), + _ => None, + } + } + + pub fn as_int8(&self) -> Option<i8> { + match self { + Variant::Int8(i) => Some(*i), Review Comment: makes sense to me ########## parquet-variant/src/variant.rs: ########## @@ -0,0 +1,321 @@ +use std::{borrow::Cow, ops::Index}; + +use crate::decoder::{self, get_variant_type}; +use arrow_schema::ArrowError; +use strum_macros::EnumDiscriminants; + +#[derive(Clone, Copy, Debug, PartialEq)] +/// Encodes the Variant Metadata, see the Variant spec file for more information Review Comment: > Actually, there are four different XX_size_minus_one situations in the variant spec (metadata dictionary offsets, array value offsets, object field ids, and object value offsets). We should probably factor out a helper class for working with them, since it's ~always the same: If we really want to obsess about performance, we could potentially make `VariantMetadata` templated on the offset size. I am not sure how much that will matter so we probably shouldn't do it at this point -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
