This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 4ac3114500 [Variant] Add primitive type timestamp_nanos(with&without
timezone) and uuid (#8149)
4ac3114500 is described below
commit 4ac31145005e480f5dce624b5100ab0ab8a260cd
Author: Congxian Qiu <[email protected]>
AuthorDate: Thu Aug 21 02:22:54 2025 +0800
[Variant] Add primitive type timestamp_nanos(with&without timezone) and
uuid (#8149)
# Which issue does this PR close?
- Closes #8126.
# Rationale for this change
This PR adds the remaining variant primitive
types (timestamp_nanos/timestampntz_nanos/uuid).
# What changes are included in this PR?
- Add primitive variant types for
timestamp_nanos/timestampntz_nanos/uuid
# Are these changes tested?
Added some tests and reused existing tests.
# Are there any user-facing changes?
No
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet-variant-compute/src/cast_to_variant.rs | 14 ++-
parquet-variant-json/Cargo.toml | 1 +
parquet-variant-json/src/to_json.rs | 74 ++++++++++++++--
parquet-variant/Cargo.toml | 1 +
parquet-variant/src/builder.rs | 24 ++++++
parquet-variant/src/decoder.rs | 81 ++++++++++++++++++
parquet-variant/src/variant.rs | 113 +++++++++++++++++++++----
parquet-variant/tests/variant_interop.rs | 4 +
8 files changed, 286 insertions(+), 26 deletions(-)
diff --git a/parquet-variant-compute/src/cast_to_variant.rs
b/parquet-variant-compute/src/cast_to_variant.rs
index 926a4d4efc..7eeb4da632 100644
--- a/parquet-variant-compute/src/cast_to_variant.rs
+++ b/parquet-variant-compute/src/cast_to_variant.rs
@@ -591,13 +591,19 @@ mod tests {
Arc::new(microsecond_array.with_timezone("+01:00".to_string())),
);
- // nanoseconds should get truncated to microseconds
+ let timestamp = DateTime::from_timestamp_nanos(nanosecond);
let nanosecond_array =
TimestampNanosecondArray::from(vec![Some(nanosecond), None]);
- run_array_tests(
- microsecond,
+ run_test(
Arc::new(nanosecond_array.clone()),
+ vec![
+ Some(Variant::TimestampNtzNanos(timestamp.naive_utc())),
+ None,
+ ],
+ );
+ run_test(
Arc::new(nanosecond_array.with_timezone("+01:00".to_string())),
- )
+ vec![Some(Variant::TimestampNanos(timestamp)), None],
+ );
}
#[test]
diff --git a/parquet-variant-json/Cargo.toml b/parquet-variant-json/Cargo.toml
index 76255f0681..5d8e02546b 100644
--- a/parquet-variant-json/Cargo.toml
+++ b/parquet-variant-json/Cargo.toml
@@ -37,6 +37,7 @@ parquet-variant = { path = "../parquet-variant" }
chrono = { workspace = true }
serde_json = "1.0"
base64 = "0.22"
+uuid = "1.18.0"
[lib]
diff --git a/parquet-variant-json/src/to_json.rs
b/parquet-variant-json/src/to_json.rs
index 4753d6cc96..b1894a64f8 100644
--- a/parquet-variant-json/src/to_json.rs
+++ b/parquet-variant-json/src/to_json.rs
@@ -181,9 +181,14 @@ impl<'m, 'v> VariantToJson for Variant<'m, 'v> {
Variant::Decimal8(decimal) => write!(buffer, "{decimal}")?,
Variant::Decimal16(decimal) => write!(buffer, "{decimal}")?,
Variant::Date(date) => write!(buffer, "\"{}\"",
format_date_string(date))?,
- Variant::TimestampMicros(ts) => write!(buffer, "\"{}\"",
ts.to_rfc3339())?,
+ Variant::TimestampMicros(ts) | Variant::TimestampNanos(ts) => {
+ write!(buffer, "\"{}\"", ts.to_rfc3339())?
+ }
Variant::TimestampNtzMicros(ts) => {
- write!(buffer, "\"{}\"", format_timestamp_ntz_string(ts))?
+ write!(buffer, "\"{}\"", format_timestamp_ntz_string(ts, 6))?
+ }
+ Variant::TimestampNtzNanos(ts) => {
+ write!(buffer, "\"{}\"", format_timestamp_ntz_string(ts, 9))?
}
Variant::Time(time) => write!(buffer, "\"{}\"",
format_time_ntz_str(time))?,
Variant::Binary(bytes) => {
@@ -208,6 +213,9 @@ impl<'m, 'v> VariantToJson for Variant<'m, 'v> {
})?;
write!(buffer, "{json_str}")?
}
+ Variant::Uuid(uuid) => {
+ write!(buffer, "\"{uuid}\"")?;
+ }
Variant::Object(obj) => {
convert_object_to_json(buffer, obj)?;
}
@@ -297,12 +305,18 @@ impl<'m, 'v> VariantToJson for Variant<'m, 'v> {
Ok(value)
}
Variant::Date(date) => Ok(Value::String(format_date_string(date))),
- Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())),
- Variant::TimestampNtzMicros(ts) =>
Ok(Value::String(format_timestamp_ntz_string(ts))),
+ Variant::TimestampMicros(ts) | Variant::TimestampNanos(ts) => {
+ Ok(Value::String(ts.to_rfc3339()))
+ }
+ Variant::TimestampNtzMicros(ts) => {
+ Ok(Value::String(format_timestamp_ntz_string(ts, 6)))
+ }
+ Variant::TimestampNtzNanos(ts) =>
Ok(Value::String(format_timestamp_ntz_string(ts, 9))),
Variant::Time(time) =>
Ok(Value::String(format_time_ntz_str(time))),
Variant::Binary(bytes) =>
Ok(Value::String(format_binary_base64(bytes))),
Variant::String(s) => Ok(Value::String(s.to_string())),
Variant::ShortString(s) => Ok(Value::String(s.to_string())),
+ Variant::Uuid(uuid) => Ok(Value::String(uuid.to_string())),
Variant::Object(obj) => {
let map = obj
.iter()
@@ -323,15 +337,18 @@ impl<'m, 'v> VariantToJson for Variant<'m, 'v> {
// Format string constants to avoid duplication and reduce errors
const DATE_FORMAT: &str = "%Y-%m-%d";
-const TIMESTAMP_NTZ_FORMAT: &str = "%Y-%m-%dT%H:%M:%S%.6f";
// Helper functions for consistent formatting
fn format_date_string(date: &chrono::NaiveDate) -> String {
date.format(DATE_FORMAT).to_string()
}
-fn format_timestamp_ntz_string(ts: &chrono::NaiveDateTime) -> String {
- ts.format(TIMESTAMP_NTZ_FORMAT).to_string()
+fn format_timestamp_ntz_string(ts: &chrono::NaiveDateTime, precision: usize)
-> String {
+ let format_str = format!(
+ "{}",
+ ts.format(&format!("%Y-%m-%dT%H:%M:%S%.{}f", precision))
+ );
+ ts.format(format_str.as_str()).to_string()
}
fn format_binary_base64(bytes: &[u8]) -> String {
@@ -497,6 +514,34 @@ mod tests {
Ok(())
}
+ #[test]
+ fn test_timestamp_nanos_to_json() -> Result<(), ArrowError> {
+ let timestamp =
DateTime::parse_from_rfc3339("2023-12-25T10:30:45.123456789Z")
+ .unwrap()
+ .with_timezone(&Utc);
+ let variant = Variant::TimestampNanos(timestamp);
+ let json = variant.to_json_string()?;
+ assert_eq!(json, "\"2023-12-25T10:30:45.123456789+00:00\"");
+
+ let json_value = variant.to_json_value()?;
+ assert!(matches!(json_value, Value::String(_)));
+ Ok(())
+ }
+
+ #[test]
+ fn test_timestamp_ntz_nanos_to_json() -> Result<(), ArrowError> {
+ let naive_timestamp = DateTime::from_timestamp(1703505045, 123456789)
+ .unwrap()
+ .naive_utc();
+ let variant = Variant::TimestampNtzNanos(naive_timestamp);
+ let json = variant.to_json_string()?;
+ assert_eq!(json, "\"2023-12-25T11:50:45.123456789\"");
+
+ let json_value = variant.to_json_value()?;
+ assert!(matches!(json_value, Value::String(_)));
+ Ok(())
+ }
+
#[test]
fn test_binary_to_json() -> Result<(), ArrowError> {
let binary_data = b"Hello, World!";
@@ -546,6 +591,21 @@ mod tests {
Ok(())
}
+ #[test]
+ fn test_uuid_to_json() -> Result<(), ArrowError> {
+ let uuid =
uuid::Uuid::parse_str("123e4567-e89b-12d3-a456-426614174000").unwrap();
+ let variant = Variant::Uuid(uuid);
+ let json = variant.to_json_string()?;
+ assert_eq!(json, "\"123e4567-e89b-12d3-a456-426614174000\"");
+
+ let json_value = variant.to_json_value()?;
+ assert_eq!(
+ json_value,
+ Value::String("123e4567-e89b-12d3-a456-426614174000".to_string())
+ );
+ Ok(())
+ }
+
#[test]
fn test_string_escaping() -> Result<(), ArrowError> {
let variant = Variant::from("hello\nworld\t\"quoted\"");
diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml
index 51fa4cc233..9e0fa98828 100644
--- a/parquet-variant/Cargo.toml
+++ b/parquet-variant/Cargo.toml
@@ -34,6 +34,7 @@ rust-version = { workspace = true }
arrow-schema = { workspace = true }
chrono = { workspace = true }
indexmap = "2.10.0"
+uuid = { version = "1.18.0"}
simdutf8 = { workspace = true , optional = true }
diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs
index 6ab51ac23e..fe3dd52853 100644
--- a/parquet-variant/src/builder.rs
+++ b/parquet-variant/src/builder.rs
@@ -23,6 +23,7 @@ use arrow_schema::ArrowError;
use chrono::Timelike;
use indexmap::{IndexMap, IndexSet};
use std::collections::HashSet;
+use uuid::Uuid;
const BASIC_TYPE_BITS: u8 = 2;
const UNIX_EPOCH_DATE: chrono::NaiveDate =
chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
@@ -198,6 +199,23 @@ impl ValueBuffer {
self.append_slice(&micros_from_midnight.to_le_bytes());
}
+ fn append_timestamp_nanos(&mut self, value: chrono::DateTime<chrono::Utc>)
{
+ self.append_primitive_header(VariantPrimitiveType::TimestampNanos);
+ let nanos = value.timestamp_nanos_opt().unwrap();
+ self.append_slice(&nanos.to_le_bytes());
+ }
+
+ fn append_timestamp_ntz_nanos(&mut self, value: chrono::NaiveDateTime) {
+ self.append_primitive_header(VariantPrimitiveType::TimestampNtzNanos);
+ let nanos = value.and_utc().timestamp_nanos_opt().unwrap();
+ self.append_slice(&nanos.to_le_bytes());
+ }
+
+ fn append_uuid(&mut self, value: Uuid) {
+ self.append_primitive_header(VariantPrimitiveType::Uuid);
+ self.append_slice(&value.into_bytes());
+ }
+
fn append_decimal4(&mut self, decimal4: VariantDecimal4) {
self.append_primitive_header(VariantPrimitiveType::Decimal4);
self.append_u8(decimal4.scale());
@@ -332,6 +350,8 @@ impl ValueBuffer {
Variant::Date(v) => self.append_date(v),
Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
Variant::TimestampNtzMicros(v) =>
self.append_timestamp_ntz_micros(v),
+ Variant::TimestampNanos(v) => self.append_timestamp_nanos(v),
+ Variant::TimestampNtzNanos(v) =>
self.append_timestamp_ntz_nanos(v),
Variant::Decimal4(decimal4) => self.append_decimal4(decimal4),
Variant::Decimal8(decimal8) => self.append_decimal8(decimal8),
Variant::Decimal16(decimal16) => self.append_decimal16(decimal16),
@@ -340,6 +360,7 @@ impl ValueBuffer {
Variant::Binary(v) => self.append_binary(v),
Variant::String(s) => self.append_string(s),
Variant::ShortString(s) => self.append_short_string(s),
+ Variant::Uuid(v) => self.append_uuid(v),
Variant::Object(obj) => self.append_object(metadata_builder, obj),
Variant::List(list) => self.append_list(metadata_builder, list),
Variant::Time(v) => self.append_time_micros(v),
@@ -363,12 +384,15 @@ impl ValueBuffer {
Variant::Date(v) => self.append_date(v),
Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
Variant::TimestampNtzMicros(v) =>
self.append_timestamp_ntz_micros(v),
+ Variant::TimestampNanos(v) => self.append_timestamp_nanos(v),
+ Variant::TimestampNtzNanos(v) =>
self.append_timestamp_ntz_nanos(v),
Variant::Decimal4(decimal4) => self.append_decimal4(decimal4),
Variant::Decimal8(decimal8) => self.append_decimal8(decimal8),
Variant::Decimal16(decimal16) => self.append_decimal16(decimal16),
Variant::Float(v) => self.append_float(v),
Variant::Double(v) => self.append_double(v),
Variant::Binary(v) => self.append_binary(v),
+ Variant::Uuid(v) => self.append_uuid(v),
Variant::String(s) => self.append_string(s),
Variant::ShortString(s) => self.append_short_string(s),
Variant::Object(obj) => self.try_append_object(metadata_builder,
obj)?,
diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs
index ff870596e4..26b4e204fa 100644
--- a/parquet-variant/src/decoder.rs
+++ b/parquet-variant/src/decoder.rs
@@ -21,6 +21,7 @@ use crate::ShortString;
use arrow_schema::ArrowError;
use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime, Utc};
+use uuid::Uuid;
/// The basic type of a [`Variant`] value, encoded in the first two bits of the
/// header byte.
@@ -64,6 +65,9 @@ pub enum VariantPrimitiveType {
Binary = 15,
String = 16,
Time = 17,
+ TimestampNanos = 18,
+ TimestampNtzNanos = 19,
+ Uuid = 20,
}
/// Extracts the basic type from a header byte
@@ -106,6 +110,9 @@ impl TryFrom<u8> for VariantPrimitiveType {
15 => Ok(VariantPrimitiveType::Binary),
16 => Ok(VariantPrimitiveType::String),
17 => Ok(VariantPrimitiveType::Time),
+ 18 => Ok(VariantPrimitiveType::TimestampNanos),
+ 19 => Ok(VariantPrimitiveType::TimestampNtzNanos),
+ 20 => Ok(VariantPrimitiveType::Uuid),
_ => Err(ArrowError::InvalidArgumentError(format!(
"unknown primitive type: {value}",
))),
@@ -316,6 +323,25 @@ pub(crate) fn decode_time_ntz(data: &[u8]) ->
Result<NaiveTime, ArrowError> {
.ok_or(case_error)
}
+/// Decodes a TimestampNanos from the value section of a variant.
+pub(crate) fn decode_timestamp_nanos(data: &[u8]) -> Result<DateTime<Utc>,
ArrowError> {
+ let nanos_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
+
+ // DateTime::from_timestamp_nanos would never fail
+ Ok(DateTime::from_timestamp_nanos(nanos_since_epoch))
+}
+
+/// Decodes a TimestampNtzNanos from the value section of a variant.
+pub(crate) fn decode_timestampntz_nanos(data: &[u8]) -> Result<NaiveDateTime,
ArrowError> {
+ decode_timestamp_nanos(data).map(|v| v.naive_utc())
+}
+
+/// Decodes a UUID from the value section of a variant.
+pub(crate) fn decode_uuid(data: &[u8]) -> Result<Uuid, ArrowError> {
+ Uuid::from_slice(&data[0..16])
+ .map_err(|_| ArrowError::CastError(format!("Cant decode uuid from
{:?}", &data[0..16])))
+}
+
/// Decodes a Binary from the value section of a variant.
pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
@@ -460,6 +486,61 @@ mod tests {
.and_hms_milli_opt(16, 34, 56, 780)
.unwrap()
);
+
+ test_decoder_bounds!(
+ test_timestamp_nanos,
+ [0x15, 0x41, 0xa2, 0x5a, 0x36, 0xa2, 0x5b, 0x18],
+ decode_timestamp_nanos,
+ NaiveDate::from_ymd_opt(2025, 8, 14)
+ .unwrap()
+ .and_hms_nano_opt(12, 33, 54, 123456789)
+ .unwrap()
+ .and_utc()
+ );
+
+ test_decoder_bounds!(
+ test_timestamp_nanos_before_epoch,
+ [0x15, 0x41, 0x52, 0xd4, 0x94, 0xe5, 0xad, 0xfa],
+ decode_timestamp_nanos,
+ NaiveDate::from_ymd_opt(1957, 11, 7)
+ .unwrap()
+ .and_hms_nano_opt(12, 33, 54, 123456789)
+ .unwrap()
+ .and_utc()
+ );
+
+ test_decoder_bounds!(
+ test_timestampntz_nanos,
+ [0x15, 0x41, 0xa2, 0x5a, 0x36, 0xa2, 0x5b, 0x18],
+ decode_timestampntz_nanos,
+ NaiveDate::from_ymd_opt(2025, 8, 14)
+ .unwrap()
+ .and_hms_nano_opt(12, 33, 54, 123456789)
+ .unwrap()
+ );
+
+ test_decoder_bounds!(
+ test_timestampntz_nanos_before_epoch,
+ [0x15, 0x41, 0x52, 0xd4, 0x94, 0xe5, 0xad, 0xfa],
+ decode_timestampntz_nanos,
+ NaiveDate::from_ymd_opt(1957, 11, 7)
+ .unwrap()
+ .and_hms_nano_opt(12, 33, 54, 123456789)
+ .unwrap()
+ );
+ }
+
+ #[test]
+ fn test_uuid() {
+ let data = [
+ 0xf2, 0x4f, 0x9b, 0x64, 0x81, 0xfa, 0x49, 0xd1, 0xb7, 0x4e, 0x8c,
0x09, 0xa6, 0xe3,
+ 0x1c, 0x56,
+ ];
+ let result = decode_uuid(&data).unwrap();
+ assert_eq!(
+ Uuid::parse_str("f24f9b64-81fa-49d1-b74e-8c09a6e31c56").unwrap(),
+ result
+ );
}
mod time {
diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs
index 62da32bebd..0bf3eed979 100644
--- a/parquet-variant/src/variant.rs
+++ b/parquet-variant/src/variant.rs
@@ -27,7 +27,8 @@ use crate::utils::{first_byte_from_slice, slice_from_slice};
use std::ops::Deref;
use arrow_schema::ArrowError;
-use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc};
+use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc};
+use uuid::Uuid;
mod decimal;
mod list;
@@ -229,6 +230,10 @@ pub enum Variant<'m, 'v> {
TimestampMicros(DateTime<Utc>),
/// Primitive (type_id=1): TIMESTAMP(isAdjustedToUTC=false, MICROS)
TimestampNtzMicros(NaiveDateTime),
+ /// Primitive (type_id=1): TIMESTAMP(isAdjustedToUTC=true, NANOS)
+ TimestampNanos(DateTime<Utc>),
+ /// Primitive (type_id=1): TIMESTAMP(isAdjustedToUTC=false, NANOS)
+ TimestampNtzNanos(NaiveDateTime),
/// Primitive (type_id=1): DECIMAL(precision, scale) 32-bits
Decimal4(VariantDecimal4),
/// Primitive (type_id=1): DECIMAL(precision, scale) 64-bits
@@ -250,6 +255,8 @@ pub enum Variant<'m, 'v> {
String(&'v str),
/// Primitive (type_id=1): TIME(isAdjustedToUTC=false, MICROS)
Time(NaiveTime),
+ /// Primitive (type_id=1): UUID
+ Uuid(Uuid),
/// Short String (type_id=2): STRING
ShortString(ShortString<'v>),
// need both metadata & value
@@ -381,6 +388,13 @@ impl<'m, 'v> Variant<'m, 'v> {
VariantPrimitiveType::TimestampNtzMicros => {
Variant::TimestampNtzMicros(decoder::decode_timestampntz_micros(value_data)?)
}
+ VariantPrimitiveType::TimestampNanos => {
+
Variant::TimestampNanos(decoder::decode_timestamp_nanos(value_data)?)
+ }
+ VariantPrimitiveType::TimestampNtzNanos => {
+
Variant::TimestampNtzNanos(decoder::decode_timestampntz_nanos(value_data)?)
+ }
+ VariantPrimitiveType::Uuid =>
Variant::Uuid(decoder::decode_uuid(value_data)?),
VariantPrimitiveType::Binary => {
Variant::Binary(decoder::decode_binary(value_data)?)
}
@@ -528,11 +542,9 @@ impl<'m, 'v> Variant<'m, 'v> {
/// let datetime = NaiveDate::from_ymd_opt(2025, 4,
16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap().and_utc();
/// let v1 = Variant::from(datetime);
/// assert_eq!(v1.as_datetime_utc(), Some(datetime));
- ///
- /// // or a non-UTC-adjusted variant
- /// let datetime = NaiveDate::from_ymd_opt(2025, 4,
16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap();
- /// let v2 = Variant::from(datetime);
- /// assert_eq!(v2.as_datetime_utc(), Some(datetime.and_utc()));
+ /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 8,
14).unwrap().and_hms_nano_opt(12, 33, 54, 123456789).unwrap().and_utc();
+ /// let v2 = Variant::from(datetime_nanos);
+ /// assert_eq!(v2.as_datetime_utc(), Some(datetime_nanos));
///
/// // but not from other variants
/// let v3 = Variant::from("hello!");
@@ -540,8 +552,7 @@ impl<'m, 'v> Variant<'m, 'v> {
/// ```
pub fn as_datetime_utc(&self) -> Option<DateTime<Utc>> {
match *self {
- Variant::TimestampMicros(d) => Some(d),
- Variant::TimestampNtzMicros(d) => Some(d.and_utc()),
+ Variant::TimestampMicros(d) | Variant::TimestampNanos(d) =>
Some(d),
_ => None,
}
}
@@ -563,9 +574,9 @@ impl<'m, 'v> Variant<'m, 'v> {
/// assert_eq!(v1.as_naive_datetime(), Some(datetime));
///
/// // or a UTC-adjusted variant
- /// let datetime = NaiveDate::from_ymd_opt(2025, 4,
16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap().and_utc();
+ /// let datetime = NaiveDate::from_ymd_opt(2025, 4,
16).unwrap().and_hms_nano_opt(12, 34, 56, 123456789).unwrap();
/// let v2 = Variant::from(datetime);
- /// assert_eq!(v2.as_naive_datetime(), Some(datetime.naive_utc()));
+ /// assert_eq!(v2.as_naive_datetime(), Some(datetime));
///
/// // but not from other variants
/// let v3 = Variant::from("hello!");
@@ -573,8 +584,7 @@ impl<'m, 'v> Variant<'m, 'v> {
/// ```
pub fn as_naive_datetime(&self) -> Option<NaiveDateTime> {
match *self {
- Variant::TimestampNtzMicros(d) => Some(d),
- Variant::TimestampMicros(d) => Some(d.naive_utc()),
+ Variant::TimestampNtzMicros(d) | Variant::TimestampNtzNanos(d) =>
Some(d),
_ => None,
}
}
@@ -632,6 +642,32 @@ impl<'m, 'v> Variant<'m, 'v> {
}
}
+ /// Converts this variant to a `uuid hyphenated string` if possible.
+ ///
+ /// Returns `Some(String)` for UUID variants, `None` for non-UUID variants.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use parquet_variant::Variant;
+ ///
+ /// // You can extract a UUID from a UUID variant
+ /// let s =
uuid::Uuid::parse_str("67e55044-10b1-426f-9247-bb680e5fe0c8").unwrap();
+ /// let v1 = Variant::Uuid(s);
+ /// assert_eq!(s, v1.as_uuid().unwrap());
+ /// assert_eq!("67e55044-10b1-426f-9247-bb680e5fe0c8",
v1.as_uuid().unwrap().to_string());
+ ///
+ /// //but not from other variants
+ /// let v2 = Variant::from(1234);
+ /// assert_eq!(None, v2.as_uuid())
+ /// ```
+ pub fn as_uuid(&self) -> Option<Uuid> {
+ match self {
+ Variant::Uuid(u) => Some(*u),
+ _ => None,
+ }
+ }
+
/// Converts this variant to an `i8` if possible.
///
/// Returns `Some(i8)` for integer variants that fit in `i8` range,
@@ -1262,12 +1298,21 @@ impl From<NaiveDate> for Variant<'_, '_> {
impl From<DateTime<Utc>> for Variant<'_, '_> {
fn from(value: DateTime<Utc>) -> Self {
- Variant::TimestampMicros(value)
+ if value.nanosecond() % 1000 > 0 {
+ Variant::TimestampNanos(value)
+ } else {
+ Variant::TimestampMicros(value)
+ }
}
}
+
impl From<NaiveDateTime> for Variant<'_, '_> {
fn from(value: NaiveDateTime) -> Self {
- Variant::TimestampNtzMicros(value)
+ if value.nanosecond() % 1000 > 0 {
+ Variant::TimestampNtzNanos(value)
+ } else {
+ Variant::TimestampNtzMicros(value)
+ }
}
}
@@ -1367,10 +1412,13 @@ impl std::fmt::Debug for Variant<'_, '_> {
Variant::TimestampNtzMicros(ts) => {
f.debug_tuple("TimestampNtzMicros").field(ts).finish()
}
+ Variant::TimestampNanos(ts) =>
f.debug_tuple("TimestampNanos").field(ts).finish(),
+ Variant::TimestampNtzNanos(ts) =>
f.debug_tuple("TimestampNtzNanos").field(ts).finish(),
Variant::Binary(bytes) => write!(f, "Binary({:?})",
HexString(bytes)),
Variant::String(s) => f.debug_tuple("String").field(s).finish(),
Variant::Time(s) => f.debug_tuple("Time").field(s).finish(),
Variant::ShortString(s) =>
f.debug_tuple("ShortString").field(s).finish(),
+ Variant::Uuid(uuid) => f.debug_tuple("Uuid").field(&uuid).finish(),
Variant::Object(obj) => {
let mut map = f.debug_map();
for res in obj.iter_try() {
@@ -1476,6 +1524,25 @@ mod tests {
Variant::TimestampNtzMicros(timestamp_ntz),
);
+ let timestamp_nanos_utc = chrono::NaiveDate::from_ymd_opt(2025, 8, 15)
+ .unwrap()
+ .and_hms_nano_opt(12, 3, 4, 123456789)
+ .unwrap()
+ .and_utc();
+ root_obj.insert(
+ "timestamp_nanos",
+ Variant::TimestampNanos(timestamp_nanos_utc),
+ );
+
+ let timestamp_ntz_nanos = chrono::NaiveDate::from_ymd_opt(2025, 8, 15)
+ .unwrap()
+ .and_hms_nano_opt(12, 3, 4, 123456789)
+ .unwrap();
+ root_obj.insert(
+ "timestamp_ntz_nanos",
+ Variant::TimestampNtzNanos(timestamp_ntz_nanos),
+ );
+
// Add decimal types
let decimal4 = VariantDecimal4::try_new(1234i32, 2).unwrap();
root_obj.insert("decimal4", decimal4);
@@ -1497,6 +1564,10 @@ mod tests {
let time = NaiveTime::from_hms_micro_opt(1, 2, 3, 4).unwrap();
root_obj.insert("time", time);
+ // Add uuid
+ let uuid =
Uuid::parse_str("67e55044-10b1-426f-9247-bb680e5fe0c8").unwrap();
+ root_obj.insert("uuid", Variant::Uuid(uuid));
+
// Add nested object
let mut nested_obj = root_obj.new_object("nested_object");
nested_obj.insert("inner_key1", "inner_value1");
@@ -1540,17 +1611,20 @@ mod tests {
assert!(debug_output.contains("\"date\": Date(2024-12-25)"));
assert!(debug_output.contains("\"timestamp_micros\":
TimestampMicros("));
assert!(debug_output.contains("\"timestamp_ntz_micros\":
TimestampNtzMicros("));
+ assert!(debug_output.contains("\"timestamp_nanos\": TimestampNanos("));
+ assert!(debug_output.contains("\"timestamp_ntz_nanos\":
TimestampNtzNanos("));
assert!(debug_output.contains("\"decimal4\": Decimal4("));
assert!(debug_output.contains("\"decimal8\": Decimal8("));
assert!(debug_output.contains("\"decimal16\": Decimal16("));
assert!(debug_output.contains("\"binary\": Binary(01 02 03 04 de ad be
ef)"));
assert!(debug_output.contains("\"string\": String("));
assert!(debug_output.contains("\"short_string\": ShortString("));
+ assert!(debug_output.contains("\"uuid\":
Uuid(67e55044-10b1-426f-9247-bb680e5fe0c8)"));
assert!(debug_output.contains("\"time\": Time(01:02:03.000004)"));
assert!(debug_output.contains("\"nested_object\":"));
assert!(debug_output.contains("\"mixed_list\":"));
- let expected = r#"{"binary": Binary(01 02 03 04 de ad be ef),
"boolean_false": BooleanFalse, "boolean_true": BooleanTrue, "date":
Date(2024-12-25), "decimal16": Decimal16(VariantDecimal16 { integer:
123456789012345678901234567890, scale: 4 }), "decimal4":
Decimal4(VariantDecimal4 { integer: 1234, scale: 2 }), "decimal8":
Decimal8(VariantDecimal8 { integer: 123456789, scale: 3 }), "double":
Double(1.23456789), "float": Float(1.234), "int16": Int16(1234), "int32":
Int32(123456), "i [...]
+ let expected = r#"{"binary": Binary(01 02 03 04 de ad be ef),
"boolean_false": BooleanFalse, "boolean_true": BooleanTrue, "date":
Date(2024-12-25), "decimal16": Decimal16(VariantDecimal16 { integer:
123456789012345678901234567890, scale: 4 }), "decimal4":
Decimal4(VariantDecimal4 { integer: 1234, scale: 2 }), "decimal8":
Decimal8(VariantDecimal8 { integer: 123456789, scale: 3 }), "double":
Double(1.23456789), "float": Float(1.234), "int16": Int16(1234), "int32":
Int32(123456), "i [...]
assert_eq!(debug_output, expected);
// Test alternate Debug formatter (#?)
@@ -1648,9 +1722,18 @@ mod tests {
"timestamp_micros": TimestampMicros(
2024-12-25T15:30:45.123Z,
),
+ "timestamp_nanos": TimestampNanos(
+ 2025-08-15T12:03:04.123456789Z,
+ ),
"timestamp_ntz_micros": TimestampNtzMicros(
2024-12-25T15:30:45.123,
),
+ "timestamp_ntz_nanos": TimestampNtzNanos(
+ 2025-08-15T12:03:04.123456789,
+ ),
+ "uuid": Uuid(
+ 67e55044-10b1-426f-9247-bb680e5fe0c8,
+ ),
}"#;
assert_eq!(alt_debug_output, expected);
}
diff --git a/parquet-variant/tests/variant_interop.rs
b/parquet-variant/tests/variant_interop.rs
index 1c5b8ed221..518a77f53f 100644
--- a/parquet-variant/tests/variant_interop.rs
+++ b/parquet-variant/tests/variant_interop.rs
@@ -28,6 +28,7 @@ use parquet_variant::{
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
+use uuid::Uuid;
/// Returns a directory path for the parquet variant test data.
///
@@ -126,6 +127,9 @@ fn get_primitive_cases() -> Vec<(&'static str,
Variant<'static, 'static>)> {
("primitive_string", Variant::String("This string is longer than 64
bytes and therefore does not fit in a short_string and it also includes several
non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")),
("primitive_timestamp",
Variant::TimestampMicros(NaiveDate::from_ymd_opt(2025, 4,
16).unwrap().and_hms_milli_opt(16, 34, 56, 780).unwrap().and_utc())),
("primitive_timestampntz",
Variant::TimestampNtzMicros(NaiveDate::from_ymd_opt(2025, 4,
16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap())),
+ ("primitive_timestamp_nanos",
Variant::TimestampNanos(NaiveDate::from_ymd_opt(2024, 11,
7).unwrap().and_hms_nano_opt(12, 33, 54, 123456789).unwrap().and_utc())),
+ ("primitive_timestampntz_nanos",
Variant::TimestampNtzNanos(NaiveDate::from_ymd_opt(2024, 11,
7).unwrap().and_hms_nano_opt(12, 33, 54, 123456789).unwrap())),
+ ("primitive_uuid",
Variant::Uuid(Uuid::parse_str("f24f9b64-81fa-49d1-b74e-8c09a6e31c56").unwrap())),
("short_string", Variant::ShortString(ShortString::try_new("Less than
64 bytes (❤\u{fe0f} with utf8)").unwrap())),
("primitive_time", Variant::Time(NaiveTime::from_hms_micro_opt(12, 33,
54, 123456).unwrap())),
]