scovich commented on code in PR #7452: URL: https://github.com/apache/arrow-rs/pull/7452#discussion_r2098519833
########## arrow-variant/src/variant.rs: ########## @@ -0,0 +1,418 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Core Variant data type for working with the Arrow Variant binary format. + +use crate::decoder; +use arrow_schema::ArrowError; +use std::fmt; + +/// A Variant value in the Arrow binary format +#[derive(Debug, Clone, PartialEq)] +pub struct Variant<'a> { + /// Raw metadata bytes + metadata: &'a [u8], + /// Raw value bytes + value: &'a [u8], +} + +impl<'a> Variant<'a> { + /// Creates a new Variant with metadata and value bytes + pub fn new(metadata: &'a [u8], value: &'a [u8]) -> Self { + Self { metadata, value } + } + + /// Creates a Variant by parsing binary metadata and value + pub fn try_new(metadata: &'a [u8], value: &'a [u8]) -> Result<Self, ArrowError> { + // Validate that the binary data is a valid Variant + decoder::validate_variant(value, metadata)?; + + Ok(Self { metadata, value }) + } + + /// Returns the raw metadata bytes + pub fn metadata(&self) -> &'a [u8] { + self.metadata + } + + /// Returns the raw value bytes + pub fn value(&self) -> &'a [u8] { + self.value + } + + /// Gets a value by key from an object Variant + /// + /// Returns: + /// - `Ok(Some(Variant))` if the key exists + /// - `Ok(None)` if the key doesn't exist or the Variant is not an object + /// - `Err` if there was an error parsing the Variant + pub fn get(&self, key: &str) -> Result<Option<Variant<'a>>, ArrowError> { + let result = decoder::get_field_value_range(self.value, self.metadata, key)?; + Ok(result.map(|(start, end)| Variant { + metadata: self.metadata, // Share the same metadata reference + value: &self.value[start..end], // Use a slice of the original value buffer + })) + } + + /// Gets a value by index from an array Variant + /// + /// Returns: + /// - `Ok(Some(Variant))` if the index is valid + /// - `Ok(None)` if the index is out of bounds or the Variant is not an array + /// - `Err` if there was an error parsing the Variant + pub fn get_index(&self, index: usize) -> Result<Option<Variant<'a>>, ArrowError> { + let result = decoder::get_array_element_range(self.value, index)?; + Ok(result.map(|(start, end)| Variant { + metadata: self.metadata, // Share the same metadata reference + value: &self.value[start..end], // Use a slice of the original value buffer + })) + } + + /// Checks if this Variant is an object + pub fn is_object(&self) -> Result<bool, ArrowError> { + decoder::is_object(self.value) + } + + /// Checks if this Variant is an array + pub fn is_array(&self) -> Result<bool, ArrowError> { + decoder::is_array(self.value) + } + + /// Converts the variant value to a serde_json::Value + pub fn as_value(&self) -> Result<serde_json::Value, ArrowError> { + let keys = crate::decoder::parse_metadata_keys(self.metadata)?; + crate::decoder::decode_value(self.value, &keys) + } + + /// Converts the variant value to a string. + pub fn as_string(&self) -> Result<String, ArrowError> { + match self.as_value()? { + serde_json::Value::String(s) => Ok(s), + serde_json::Value::Number(n) => Ok(n.to_string()), + serde_json::Value::Bool(b) => Ok(b.to_string()), + serde_json::Value::Null => Ok("null".to_string()), + _ => Err(ArrowError::InvalidArgumentError( + "Cannot convert value to string".to_string(), + )), + } + } + + /// Converts the variant value to a i32. + pub fn as_i32(&self) -> Result<i32, ArrowError> { + match self.as_value()? { + serde_json::Value::Number(n) => { + if let Some(i) = n.as_i64() { + if i >= i32::MIN as i64 && i <= i32::MAX as i64 { + return Ok(i as i32); + } + } + Err(ArrowError::InvalidArgumentError( + "Number outside i32 range".to_string(), + )) + } + _ => Err(ArrowError::InvalidArgumentError( + "Cannot convert value to i32".to_string(), + )), + } + } + + /// Converts the variant value to a i64. + pub fn as_i64(&self) -> Result<i64, ArrowError> { + match self.as_value()? { + serde_json::Value::Number(n) => { + if let Some(i) = n.as_i64() { + return Ok(i); + } + Err(ArrowError::InvalidArgumentError( + "Number cannot be represented as i64".to_string(), + )) + } + _ => Err(ArrowError::InvalidArgumentError( + "Cannot convert value to i64".to_string(), + )), + } + } + + /// Converts the variant value to a bool. + pub fn as_bool(&self) -> Result<bool, ArrowError> { + match self.as_value()? { + serde_json::Value::Bool(b) => Ok(b), + serde_json::Value::Number(n) => { + if let Some(i) = n.as_i64() { + return Ok(i != 0); + } + if let Some(f) = n.as_f64() { + return Ok(f != 0.0); + } + Err(ArrowError::InvalidArgumentError( + "Cannot convert number to bool".to_string(), + )) + } + serde_json::Value::String(s) => match s.to_lowercase().as_str() { + "true" | "yes" | "1" => Ok(true), + "false" | "no" | "0" => Ok(false), + _ => Err(ArrowError::InvalidArgumentError( + "Cannot convert string to bool".to_string(), + )), + }, + _ => Err(ArrowError::InvalidArgumentError( + "Cannot convert value to bool".to_string(), + )), + } + } + + /// Converts the variant value to a f64. + pub fn as_f64(&self) -> Result<f64, ArrowError> { Review Comment: Question about casting/converting values -- what semantics do we want when requesting a specific type such as f64? From strictest to loosest: * type extraction - only return values that were actually encoded as variant Double * type widening casts - additionally convert and return values of narrower types (e.g. variant Float or Int8) * value widening casts - additionally convert and narrower values of wider types (e.g. f64 can exactly represent `Int64(1)`, but not `Int64(18014398509481985)`) * narrowing casts - additionally convert wider values of wider types, with information loss (e.g. `Int64(18014398509481985)` becomes `1.8014398509481984e+16`) * converting casts - additionally attempt to convert/parse values of unrelated types (e.g. `ShortString("10")` becomes `Double(10.0)`). Using an `enum Variant` already allows extraction by matching. Narrowing and converting casts seem a bit too dangerous/unpredictable to build in at such a low level. That leaves type widening and value widening. Type widening is pretty cheap and convenient; value widening is even more convenient but not as cheap (requires branching, because some values cannot be converted while others can). This PR is currently using value widening, which arguably matches JSON parsing most closely given that JSON only defines one numeric type. The [RFC](https://datatracker.ietf.org/doc/html/rfc7159#section-6) actually recommends treating all numbers as IEEE 754 double, which includes limiting integer precision to 54 bit signed... but we can do a bit better using a combination of Int64, Decimal16, and Double). Do others have thoughts/opinions here? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org