scovich commented on code in PR #7452:
URL: https://github.com/apache/arrow-rs/pull/7452#discussion_r2098519833


##########
arrow-variant/src/variant.rs:
##########
@@ -0,0 +1,418 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Core Variant data type for working with the Arrow Variant binary format.
+
+use crate::decoder;
+use arrow_schema::ArrowError;
+use std::fmt;
+
+/// A Variant value in the Arrow binary format
+#[derive(Debug, Clone, PartialEq)]
+pub struct Variant<'a> {
+    /// Raw metadata bytes
+    metadata: &'a [u8],
+    /// Raw value bytes
+    value: &'a [u8],
+}
+
+impl<'a> Variant<'a> {
+    /// Creates a new Variant with metadata and value bytes
+    pub fn new(metadata: &'a [u8], value: &'a [u8]) -> Self {
+        Self { metadata, value }
+    }
+
+    /// Creates a Variant by parsing binary metadata and value
+    pub fn try_new(metadata: &'a [u8], value: &'a [u8]) -> Result<Self, 
ArrowError> {
+        // Validate that the binary data is a valid Variant
+        decoder::validate_variant(value, metadata)?;
+
+        Ok(Self { metadata, value })
+    }
+
+    /// Returns the raw metadata bytes
+    pub fn metadata(&self) -> &'a [u8] {
+        self.metadata
+    }
+
+    /// Returns the raw value bytes
+    pub fn value(&self) -> &'a [u8] {
+        self.value
+    }
+
+    /// Gets a value by key from an object Variant
+    ///
+    /// Returns:
+    /// - `Ok(Some(Variant))` if the key exists
+    /// - `Ok(None)` if the key doesn't exist or the Variant is not an object
+    /// - `Err` if there was an error parsing the Variant
+    pub fn get(&self, key: &str) -> Result<Option<Variant<'a>>, ArrowError> {
+        let result = decoder::get_field_value_range(self.value, self.metadata, 
key)?;
+        Ok(result.map(|(start, end)| Variant {
+            metadata: self.metadata,        // Share the same metadata 
reference
+            value: &self.value[start..end], // Use a slice of the original 
value buffer
+        }))
+    }
+
+    /// Gets a value by index from an array Variant
+    ///
+    /// Returns:
+    /// - `Ok(Some(Variant))` if the index is valid
+    /// - `Ok(None)` if the index is out of bounds or the Variant is not an 
array
+    /// - `Err` if there was an error parsing the Variant
+    pub fn get_index(&self, index: usize) -> Result<Option<Variant<'a>>, 
ArrowError> {
+        let result = decoder::get_array_element_range(self.value, index)?;
+        Ok(result.map(|(start, end)| Variant {
+            metadata: self.metadata,        // Share the same metadata 
reference
+            value: &self.value[start..end], // Use a slice of the original 
value buffer
+        }))
+    }
+
+    /// Checks if this Variant is an object
+    pub fn is_object(&self) -> Result<bool, ArrowError> {
+        decoder::is_object(self.value)
+    }
+
+    /// Checks if this Variant is an array
+    pub fn is_array(&self) -> Result<bool, ArrowError> {
+        decoder::is_array(self.value)
+    }
+
+    /// Converts the variant value to a serde_json::Value
+    pub fn as_value(&self) -> Result<serde_json::Value, ArrowError> {
+        let keys = crate::decoder::parse_metadata_keys(self.metadata)?;
+        crate::decoder::decode_value(self.value, &keys)
+    }
+
+    /// Converts the variant value to a string.
+    pub fn as_string(&self) -> Result<String, ArrowError> {
+        match self.as_value()? {
+            serde_json::Value::String(s) => Ok(s),
+            serde_json::Value::Number(n) => Ok(n.to_string()),
+            serde_json::Value::Bool(b) => Ok(b.to_string()),
+            serde_json::Value::Null => Ok("null".to_string()),
+            _ => Err(ArrowError::InvalidArgumentError(
+                "Cannot convert value to string".to_string(),
+            )),
+        }
+    }
+
+    /// Converts the variant value to a i32.
+    pub fn as_i32(&self) -> Result<i32, ArrowError> {
+        match self.as_value()? {
+            serde_json::Value::Number(n) => {
+                if let Some(i) = n.as_i64() {
+                    if i >= i32::MIN as i64 && i <= i32::MAX as i64 {
+                        return Ok(i as i32);
+                    }
+                }
+                Err(ArrowError::InvalidArgumentError(
+                    "Number outside i32 range".to_string(),
+                ))
+            }
+            _ => Err(ArrowError::InvalidArgumentError(
+                "Cannot convert value to i32".to_string(),
+            )),
+        }
+    }
+
+    /// Converts the variant value to a i64.
+    pub fn as_i64(&self) -> Result<i64, ArrowError> {
+        match self.as_value()? {
+            serde_json::Value::Number(n) => {
+                if let Some(i) = n.as_i64() {
+                    return Ok(i);
+                }
+                Err(ArrowError::InvalidArgumentError(
+                    "Number cannot be represented as i64".to_string(),
+                ))
+            }
+            _ => Err(ArrowError::InvalidArgumentError(
+                "Cannot convert value to i64".to_string(),
+            )),
+        }
+    }
+
+    /// Converts the variant value to a bool.
+    pub fn as_bool(&self) -> Result<bool, ArrowError> {
+        match self.as_value()? {
+            serde_json::Value::Bool(b) => Ok(b),
+            serde_json::Value::Number(n) => {
+                if let Some(i) = n.as_i64() {
+                    return Ok(i != 0);
+                }
+                if let Some(f) = n.as_f64() {
+                    return Ok(f != 0.0);
+                }
+                Err(ArrowError::InvalidArgumentError(
+                    "Cannot convert number to bool".to_string(),
+                ))
+            }
+            serde_json::Value::String(s) => match s.to_lowercase().as_str() {
+                "true" | "yes" | "1" => Ok(true),
+                "false" | "no" | "0" => Ok(false),
+                _ => Err(ArrowError::InvalidArgumentError(
+                    "Cannot convert string to bool".to_string(),
+                )),
+            },
+            _ => Err(ArrowError::InvalidArgumentError(
+                "Cannot convert value to bool".to_string(),
+            )),
+        }
+    }
+
+    /// Converts the variant value to a f64.
+    pub fn as_f64(&self) -> Result<f64, ArrowError> {

Review Comment:
   Question about casting/converting values -- what semantics do we want when 
requesting a specific type such as f64? From strictest to loosest:
   * type extraction - only return values that were actually encoded as variant 
Double
   * type widening casts - additionally convert and return values of narrower 
types (e.g. variant Float or Int8)
   * value widening casts - additionally convert and narrower values of wider 
types (e.g. f64 can exactly represent `Int64(1)`, but not 
`Int64(18014398509481985)`)
   * narrowing casts - additionally convert wider values of wider types, with 
information loss (e.g. `Int64(18014398509481985)` becomes 
`1.8014398509481984e+16`)
   * converting casts - additionally attempt to convert/parse values of 
unrelated types (e.g. `ShortString("10")` becomes `Double(10.0)`).
   
   Using an `enum Variant` already allows extraction by matching. Narrowing and 
converting casts seem a bit too dangerous/unpredictable to build in at such a 
low level. That leaves type widening and value widening. Type widening is 
pretty cheap and convenient; value widening is even more convenient but not as 
cheap (requires branching, because some values cannot be converted while others 
can). 
   
   This PR is currently using value widening, which arguably matches JSON 
parsing most closely given that JSON only defines one numeric type. The  
[RFC](https://datatracker.ietf.org/doc/html/rfc7159#section-6) actually 
recommends treating all numbers as IEEE 754 double, which includes limiting 
integer precision to 54 bit signed... but we can do a bit better using a 
combination of Int64, Decimal16, and Double).
   
   Do others have thoughts/opinions here?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to