This is an automated email from the ASF dual-hosted git repository.
liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 8068407 fix: make PrimitiveLiteral and Literal not be Ord (#386)
8068407 is described below
commit 806840729c1dd886624f6ce3cb132d0912befb4b
Author: ZENOTME <[email protected]>
AuthorDate: Sat Jun 1 11:23:29 2024 +0800
fix: make PrimitiveLiteral and Literal not be Ord (#386)
* make PrimitiveLiteral and Literal not be Ord
* refine Map
* fix name
* fix map test
* refine
---------
Co-authored-by: ZENOTME <[email protected]>
---
.../expr/visitors/inclusive_metrics_evaluator.rs | 87 ++--
crates/iceberg/src/spec/datatypes.rs | 11 +-
crates/iceberg/src/spec/manifest.rs | 80 ++--
crates/iceberg/src/spec/manifest_list.rs | 43 +-
crates/iceberg/src/spec/schema.rs | 7 +-
crates/iceberg/src/spec/values.rs | 508 ++++++++++++++++-----
crates/iceberg/src/transform/temporal.rs | 8 +-
7 files changed, 504 insertions(+), 240 deletions(-)
diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
index 1691f12..5f73f2f 100644
--- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
@@ -17,7 +17,7 @@
use crate::expr::visitors::bound_predicate_visitor::{visit,
BoundPredicateVisitor};
use crate::expr::{BoundPredicate, BoundReference};
-use crate::spec::{DataFile, Datum, Literal, PrimitiveLiteral};
+use crate::spec::{DataFile, Datum, PrimitiveLiteral};
use crate::{Error, ErrorKind};
use fnv::FnvHashSet;
@@ -63,11 +63,11 @@ impl<'a> InclusiveMetricsEvaluator<'a> {
self.data_file.value_counts.get(&field_id)
}
- fn lower_bound(&self, field_id: i32) -> Option<&Literal> {
+ fn lower_bound(&self, field_id: i32) -> Option<&Datum> {
self.data_file.lower_bounds.get(&field_id)
}
- fn upper_bound(&self, field_id: i32) -> Option<&Literal> {
+ fn upper_bound(&self, field_id: i32) -> Option<&Datum> {
self.data_file.upper_bounds.get(&field_id)
}
@@ -97,7 +97,7 @@ impl<'a> InclusiveMetricsEvaluator<'a> {
&mut self,
reference: &BoundReference,
datum: &Datum,
- cmp_fn: fn(&PrimitiveLiteral, &PrimitiveLiteral) -> bool,
+ cmp_fn: fn(&Datum, &Datum) -> bool,
use_lower_bound: bool,
) -> crate::Result<bool> {
let field_id = reference.field().id;
@@ -119,14 +119,7 @@ impl<'a> InclusiveMetricsEvaluator<'a> {
};
if let Some(bound) = bound {
- let Literal::Primitive(bound) = bound else {
- return Err(Error::new(
- ErrorKind::Unexpected,
- "Inequality Predicates can only compare against a
Primitive Literal",
- ));
- };
-
- if cmp_fn(bound, datum.literal()) {
+ if cmp_fn(bound, datum) {
return ROWS_MIGHT_MATCH;
}
@@ -265,33 +258,21 @@ impl BoundPredicateVisitor for InclusiveMetricsEvaluator<'_> {
}
if let Some(lower_bound) = self.lower_bound(field_id) {
- let Literal::Primitive(lower_bound) = lower_bound else {
- return Err(Error::new(
- ErrorKind::Unexpected,
- "Eq Predicate can only compare against a Primitive
Literal",
- ));
- };
if lower_bound.is_nan() {
// NaN indicates unreliable bounds.
// See the InclusiveMetricsEvaluator docs for more.
return ROWS_MIGHT_MATCH;
- } else if lower_bound.gt(datum.literal()) {
+ } else if lower_bound.gt(datum) {
return ROWS_CANNOT_MATCH;
}
}
if let Some(upper_bound) = self.upper_bound(field_id) {
- let Literal::Primitive(upper_bound) = upper_bound else {
- return Err(Error::new(
- ErrorKind::Unexpected,
- "Eq Predicate can only compare against a Primitive
Literal",
- ));
- };
if upper_bound.is_nan() {
// NaN indicates unreliable bounds.
// See the InclusiveMetricsEvaluator docs for more.
return ROWS_MIGHT_MATCH;
- } else if upper_bound.lt(datum.literal()) {
+ } else if upper_bound.lt(datum) {
return ROWS_CANNOT_MATCH;
}
}
@@ -331,7 +312,7 @@ impl BoundPredicateVisitor for InclusiveMetricsEvaluator<'_> {
};
if let Some(lower_bound) = self.lower_bound(field_id) {
- let Literal::Primitive(PrimitiveLiteral::String(lower_bound)) =
lower_bound else {
+ let PrimitiveLiteral::String(lower_bound) = lower_bound.literal()
else {
return Err(Error::new(
ErrorKind::Unexpected,
"Cannot use StartsWith operator on non-string lower_bound
value",
@@ -349,7 +330,7 @@ impl BoundPredicateVisitor for InclusiveMetricsEvaluator<'_> {
}
if let Some(upper_bound) = self.upper_bound(field_id) {
- let Literal::Primitive(PrimitiveLiteral::String(upper_bound)) =
upper_bound else {
+ let PrimitiveLiteral::String(upper_bound) = upper_bound.literal()
else {
return Err(Error::new(
ErrorKind::Unexpected,
"Cannot use StartsWith operator on non-string upper_bound
value",
@@ -395,7 +376,7 @@ impl BoundPredicateVisitor for InclusiveMetricsEvaluator<'_> {
return ROWS_MIGHT_MATCH;
};
- let Literal::Primitive(PrimitiveLiteral::String(lower_bound_str)) =
lower_bound else {
+ let PrimitiveLiteral::String(lower_bound_str) = lower_bound.literal()
else {
return Err(Error::new(
ErrorKind::Unexpected,
"Cannot use NotStartsWith operator on non-string lower_bound
value",
@@ -416,7 +397,7 @@ impl BoundPredicateVisitor for InclusiveMetricsEvaluator<'_> {
return ROWS_MIGHT_MATCH;
};
- let Literal::Primitive(PrimitiveLiteral::String(upper_bound)) =
upper_bound else {
+ let PrimitiveLiteral::String(upper_bound) = upper_bound.literal()
else {
return Err(Error::new(
ErrorKind::Unexpected,
"Cannot use NotStartsWith operator on non-string
upper_bound value",
@@ -456,36 +437,24 @@ impl BoundPredicateVisitor for InclusiveMetricsEvaluator<'_> {
}
if let Some(lower_bound) = self.lower_bound(field_id) {
- let Literal::Primitive(lower_bound) = lower_bound else {
- return Err(Error::new(
- ErrorKind::Unexpected,
- "Eq Predicate can only compare against a Primitive
Literal",
- ));
- };
if lower_bound.is_nan() {
// NaN indicates unreliable bounds. See the
InclusiveMetricsEvaluator docs for more.
return ROWS_MIGHT_MATCH;
}
- if !literals.iter().any(|datum| datum.literal().ge(lower_bound)) {
+ if !literals.iter().any(|datum| datum.ge(lower_bound)) {
// if all values are less than lower bound, rows cannot match.
return ROWS_CANNOT_MATCH;
}
}
if let Some(upper_bound) = self.upper_bound(field_id) {
- let Literal::Primitive(upper_bound) = upper_bound else {
- return Err(Error::new(
- ErrorKind::Unexpected,
- "Eq Predicate can only compare against a Primitive
Literal",
- ));
- };
if upper_bound.is_nan() {
// NaN indicates unreliable bounds. See the
InclusiveMetricsEvaluator docs for more.
return ROWS_MIGHT_MATCH;
}
- if !literals.iter().any(|datum| datum.literal().le(upper_bound)) {
+ if !literals.iter().any(|datum| datum.le(upper_bound)) {
// if all values are greater than upper bound, rows cannot
match.
return ROWS_CANNOT_MATCH;
}
@@ -519,7 +488,7 @@ mod test {
UnaryExpression,
};
use crate::spec::{
- DataContentType, DataFile, DataFileFormat, Datum, Literal,
NestedField, PartitionField,
+ DataContentType, DataFile, DataFileFormat, Datum, NestedField,
PartitionField,
PartitionSpec, PrimitiveType, Schema, Struct, Transform, Type,
};
use fnv::FnvHashSet;
@@ -2152,17 +2121,17 @@ mod test {
nan_value_counts: HashMap::from([(7, 50), (8, 10), (9, 0)]),
lower_bounds: HashMap::from([
- (1, Literal::int(INT_MIN_VALUE)),
- (11, Literal::float(f32::NAN)),
- (12, Literal::double(f64::NAN)),
- (14, Literal::string("")),
+ (1, Datum::int(INT_MIN_VALUE)),
+ (11, Datum::float(f32::NAN)),
+ (12, Datum::double(f64::NAN)),
+ (14, Datum::string("")),
]),
upper_bounds: HashMap::from([
- (1, Literal::int(INT_MAX_VALUE)),
- (11, Literal::float(f32::NAN)),
- (12, Literal::double(f64::NAN)),
- (14, Literal::string("房东整租霍营小区二层两居室")),
+ (1, Datum::int(INT_MAX_VALUE)),
+ (11, Datum::float(f32::NAN)),
+ (12, Datum::double(f64::NAN)),
+ (14, Datum::string("房东整租霍营小区二层两居室")),
]),
column_sizes: Default::default(),
@@ -2187,9 +2156,9 @@ mod test {
nan_value_counts: HashMap::default(),
- lower_bounds: HashMap::from([(3, Literal::string("aa"))]),
+ lower_bounds: HashMap::from([(3, Datum::string("aa"))]),
- upper_bounds: HashMap::from([(3, Literal::string("dC"))]),
+ upper_bounds: HashMap::from([(3, Datum::string("dC"))]),
column_sizes: Default::default(),
key_metadata: vec![],
@@ -2214,9 +2183,9 @@ mod test {
nan_value_counts: HashMap::default(),
- lower_bounds: HashMap::from([(3, Literal::string("1str1"))]),
+ lower_bounds: HashMap::from([(3, Datum::string("1str1"))]),
- upper_bounds: HashMap::from([(3, Literal::string("3str3"))]),
+ upper_bounds: HashMap::from([(3, Datum::string("3str3"))]),
column_sizes: Default::default(),
key_metadata: vec![],
@@ -2241,9 +2210,9 @@ mod test {
nan_value_counts: HashMap::default(),
- lower_bounds: HashMap::from([(3, Literal::string("abc"))]),
+ lower_bounds: HashMap::from([(3, Datum::string("abc"))]),
- upper_bounds: HashMap::from([(3, Literal::string("イロハニホヘト"))]),
+ upper_bounds: HashMap::from([(3, Datum::string("イロハニホヘト"))]),
column_sizes: Default::default(),
key_metadata: vec![],
diff --git a/crates/iceberg/src/spec/datatypes.rs b/crates/iceberg/src/spec/datatypes.rs
index 167af35..cc911fa 100644
--- a/crates/iceberg/src/spec/datatypes.rs
+++ b/crates/iceberg/src/spec/datatypes.rs
@@ -118,8 +118,17 @@ impl Type {
matches!(self, Type::Struct(_) | Type::List(_) | Type::Map(_))
}
+ /// Convert Type to reference of PrimitiveType
+ pub fn as_primitive_type(&self) -> Option<&PrimitiveType> {
+ if let Type::Primitive(primitive_type) = self {
+ Some(primitive_type)
+ } else {
+ None
+ }
+ }
+
/// Convert Type to StructType
- pub fn as_struct_type(self) -> Option<StructType> {
+ pub fn to_struct_type(self) -> Option<StructType> {
if let Type::Struct(struct_type) = self {
Some(struct_type)
} else {
diff --git a/crates/iceberg/src/spec/manifest.rs b/crates/iceberg/src/spec/manifest.rs
index f61c0ac..f5a5984 100644
--- a/crates/iceberg/src/spec/manifest.rs
+++ b/crates/iceberg/src/spec/manifest.rs
@@ -18,11 +18,11 @@
//! Manifest for Iceberg.
use self::_const_schema::{manifest_schema_v1, manifest_schema_v2};
+use super::UNASSIGNED_SEQUENCE_NUMBER;
use super::{
- FieldSummary, FormatVersion, ManifestContentType, ManifestFile,
PartitionSpec, Schema,
+ Datum, FieldSummary, FormatVersion, ManifestContentType, ManifestFile,
PartitionSpec, Schema,
SchemaId, Struct, INITIAL_SEQUENCE_NUMBER,
};
-use super::{Literal, UNASSIGNED_SEQUENCE_NUMBER};
use crate::error::Result;
use crate::io::OutputFile;
use crate::spec::PartitionField;
@@ -1003,7 +1003,7 @@ pub struct DataFile {
///
/// - [Binary single-value
serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization)
#[builder(default)]
- pub(crate) lower_bounds: HashMap<i32, Literal>,
+ pub(crate) lower_bounds: HashMap<i32, Datum>,
/// field id: 128
/// key field id: 129
/// value field id: 130
@@ -1016,7 +1016,7 @@ pub struct DataFile {
///
/// - [Binary single-value
serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization)
#[builder(default)]
- pub(crate) upper_bounds: HashMap<i32, Literal>,
+ pub(crate) upper_bounds: HashMap<i32, Datum>,
/// field id: 131
///
/// Implementation-specific key metadata for encryption
@@ -1102,12 +1102,12 @@ impl DataFile {
}
/// Get the lower bounds of the data file values per column.
/// Map from column id to lower bound in the column serialized as binary.
- pub fn lower_bounds(&self) -> &HashMap<i32, Literal> {
+ pub fn lower_bounds(&self) -> &HashMap<i32, Datum> {
&self.lower_bounds
}
/// Get the upper bounds of the data file values per column.
/// Map from column id to upper bound in the column serialized as binary.
- pub fn upper_bounds(&self) -> &HashMap<i32, Literal> {
+ pub fn upper_bounds(&self) -> &HashMap<i32, Datum> {
&self.upper_bounds
}
/// Get the Implementation-specific key metadata for the data file.
@@ -1207,7 +1207,9 @@ mod _serde {
use serde_derive::{Deserialize, Serialize};
use serde_with::serde_as;
+ use crate::spec::Datum;
use crate::spec::Literal;
+ use crate::spec::PrimitiveLiteral;
use crate::spec::RawLiteral;
use crate::spec::Schema;
use crate::spec::Struct;
@@ -1331,8 +1333,12 @@ mod _serde {
value_counts: Some(to_i64_entry(value.value_counts)?),
null_value_counts:
Some(to_i64_entry(value.null_value_counts)?),
nan_value_counts: Some(to_i64_entry(value.nan_value_counts)?),
- lower_bounds: Some(to_bytes_entry(value.lower_bounds)),
- upper_bounds: Some(to_bytes_entry(value.upper_bounds)),
+ lower_bounds: Some(to_bytes_entry(
+ value.lower_bounds.into_iter().map(|(k, v)| (k, v.into())),
+ )),
+ upper_bounds: Some(to_bytes_entry(
+ value.upper_bounds.into_iter().map(|(k, v)| (k, v.into())),
+ )),
key_metadata:
Some(serde_bytes::ByteBuf::from(value.key_metadata)),
split_offsets: Some(value.split_offsets),
equality_ids: Some(value.equality_ids),
@@ -1415,19 +1421,28 @@ mod _serde {
fn parse_bytes_entry(
v: Vec<BytesEntry>,
schema: &Schema,
- ) -> Result<HashMap<i32, Literal>, Error> {
+ ) -> Result<HashMap<i32, Datum>, Error> {
let mut m = HashMap::with_capacity(v.len());
for entry in v {
// We ignore the entry if the field is not found in the schema,
due to schema evolution.
if let Some(field) = schema.field_by_id(entry.key) {
- let data_type = &field.field_type;
- m.insert(entry.key, Literal::try_from_bytes(&entry.value,
data_type)?);
+ let data_type = field
+ .field_type
+ .as_primitive_type()
+ .ok_or_else(|| {
+ Error::new(
+ ErrorKind::DataInvalid,
+ format!("field {} is not a primitive type",
field.name),
+ )
+ })?
+ .clone();
+ m.insert(entry.key, Datum::try_from_bytes(&entry.value,
data_type)?);
}
}
Ok(m)
}
- fn to_bytes_entry(v: HashMap<i32, Literal>) -> Vec<BytesEntry> {
+ fn to_bytes_entry(v: impl IntoIterator<Item = (i32, PrimitiveLiteral)>) ->
Vec<BytesEntry> {
v.into_iter()
.map(|e| BytesEntry {
key: e.0,
@@ -1495,6 +1510,7 @@ mod tests {
use super::*;
use crate::io::FileIOBuilder;
+ use crate::spec::Literal;
use crate::spec::NestedField;
use crate::spec::PrimitiveType;
use crate::spec::Struct;
@@ -1824,8 +1840,8 @@ mod tests {
value_counts: HashMap::from([(1,1),(2,1),(3,1)]),
null_value_counts: HashMap::from([(1,0),(2,0),(3,0)]),
nan_value_counts: HashMap::new(),
- lower_bounds:
HashMap::from([(1,Literal::int(1)),(2,Literal::string("a")),(3,Literal::string("AC/DC"))]),
- upper_bounds:
HashMap::from([(1,Literal::int(1)),(2,Literal::string("a")),(3,Literal::string("AC/DC"))]),
+ lower_bounds:
HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
+ upper_bounds:
HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
key_metadata: vec![],
split_offsets: vec![4],
equality_ids: vec![],
@@ -1903,14 +1919,14 @@ mod tests {
null_value_counts: HashMap::from([(1, 0), (2, 0), (3,
0)]),
nan_value_counts: HashMap::new(),
lower_bounds: HashMap::from([
- (1, Literal::long(1)),
- (2, Literal::string("a")),
- (3, Literal::string("x"))
+ (1, Datum::long(1)),
+ (2, Datum::string("a")),
+ (3, Datum::string("x"))
]),
upper_bounds: HashMap::from([
- (1, Literal::long(1)),
- (2, Literal::string("a")),
- (3, Literal::string("x"))
+ (1, Datum::long(1)),
+ (2, Datum::string("a")),
+ (3, Datum::string("x"))
]),
key_metadata: vec![],
split_offsets: vec![4],
@@ -1926,8 +1942,8 @@ mod tests {
let entry = test_manifest_read_write(manifest, writer).await;
assert_eq!(entry.partitions.len(), 1);
- assert_eq!(entry.partitions[0].lower_bound,
Some(Literal::string("x")));
- assert_eq!(entry.partitions[0].upper_bound,
Some(Literal::string("x")));
+ assert_eq!(entry.partitions[0].lower_bound, Some(Datum::string("x")));
+ assert_eq!(entry.partitions[0].upper_bound, Some(Datum::string("x")));
}
#[tokio::test]
@@ -1978,14 +1994,14 @@ mod tests {
null_value_counts: HashMap::default(),
nan_value_counts: HashMap::new(),
lower_bounds: HashMap::from([
- (1, Literal::long(1)),
- (2, Literal::int(2)),
- (3, Literal::string("x"))
+ (1, Datum::long(1)),
+ (2, Datum::int(2)),
+ (3, Datum::string("x"))
]),
upper_bounds: HashMap::from([
- (1, Literal::long(1)),
- (2, Literal::int(2)),
- (3, Literal::string("x"))
+ (1, Datum::long(1)),
+ (2, Datum::int(2)),
+ (3, Datum::string("x"))
]),
key_metadata: vec![],
split_offsets: vec![4],
@@ -2050,12 +2066,12 @@ mod tests {
null_value_counts: HashMap::default(),
nan_value_counts: HashMap::new(),
lower_bounds: HashMap::from([
- (1, Literal::long(1)),
- (2, Literal::int(2)),
+ (1, Datum::long(1)),
+ (2, Datum::int(2)),
]),
upper_bounds: HashMap::from([
- (1, Literal::long(1)),
- (2, Literal::int(2)),
+ (1, Datum::long(1)),
+ (2, Datum::int(2)),
]),
key_metadata: vec![],
split_offsets: vec![4],
diff --git a/crates/iceberg/src/spec/manifest_list.rs b/crates/iceberg/src/spec/manifest_list.rs
index 2bb5e30..ec7e2d8 100644
--- a/crates/iceberg/src/spec/manifest_list.rs
+++ b/crates/iceberg/src/spec/manifest_list.rs
@@ -20,7 +20,7 @@
use std::{collections::HashMap, str::FromStr};
use crate::io::FileIO;
-use crate::{io::OutputFile, spec::Literal, Error, ErrorKind};
+use crate::{io::OutputFile, Error, ErrorKind};
use apache_avro::{from_value, types::Value, Reader, Writer};
use bytes::Bytes;
@@ -29,7 +29,7 @@ use self::{
_serde::{ManifestFileV1, ManifestFileV2},
};
-use super::{FormatVersion, Manifest, StructType};
+use super::{Datum, FormatVersion, Manifest, StructType};
use crate::error::Result;
/// Placeholder for sequence number. The field with this value must be
replaced with the actual sequence number before it write.
@@ -662,11 +662,11 @@ pub struct FieldSummary {
/// field: 510
/// The minimum value for the field in the manifests
/// partitions.
- pub lower_bound: Option<Literal>,
+ pub lower_bound: Option<Datum>,
/// field: 511
/// The maximum value for the field in the manifests
/// partitions.
- pub upper_bound: Option<Literal>,
+ pub upper_bound: Option<Datum>,
}
/// This is a helper module that defines types to help with
serialization/deserialization.
@@ -675,7 +675,7 @@ pub struct FieldSummary {
/// [ManifestFileV1] and [ManifestFileV2] are internal struct that are only
used for serialization and deserialization.
pub(super) mod _serde {
use crate::{
- spec::{Literal, StructType, Type},
+ spec::{Datum, PrimitiveLiteral, PrimitiveType, StructType},
Error,
};
pub use serde_bytes::ByteBuf;
@@ -833,17 +833,17 @@ pub(super) mod _serde {
/// Converts the [FieldSummary] into a [super::FieldSummary].
/// [lower_bound] and [upper_bound] are converted into [Literal]s need
the type info so use
/// this function instead of [std::TryFrom] trait.
- pub(crate) fn try_into(self, r#type: &Type) ->
Result<super::FieldSummary> {
+ pub(crate) fn try_into(self, r#type: &PrimitiveType) ->
Result<super::FieldSummary> {
Ok(super::FieldSummary {
contains_null: self.contains_null,
contains_nan: self.contains_nan,
lower_bound: self
.lower_bound
- .map(|v| Literal::try_from_bytes(&v, r#type))
+ .map(|v| Datum::try_from_bytes(&v, r#type.clone()))
.transpose()?,
upper_bound: self
.upper_bound
- .map(|v| Literal::try_from_bytes(&v, r#type))
+ .map(|v| Datum::try_from_bytes(&v, r#type.clone()))
.transpose()?,
})
}
@@ -869,7 +869,14 @@ pub(super) mod _serde {
partitions
.into_iter()
.zip(partition_types)
- .map(|(v, field)| v.try_into(&field.field_type))
+ .map(|(v, field)| {
+
v.try_into(field.field_type.as_primitive_type().ok_or_else(|| {
+ Error::new(
+ crate::ErrorKind::DataInvalid,
+ "Invalid partition spec. Field type is not
primitive",
+ )
+ })?)
+ })
.collect::<Result<Vec<_>>>()
} else {
Err(Error::new(
@@ -958,8 +965,8 @@ pub(super) mod _serde {
.map(|v| FieldSummary {
contains_null: v.contains_null,
contains_nan: v.contains_nan,
- lower_bound: v.lower_bound.map(|v| v.into()),
- upper_bound: v.upper_bound.map(|v| v.into()),
+ lower_bound: v.lower_bound.map(|v|
PrimitiveLiteral::from(v).into()),
+ upper_bound: v.upper_bound.map(|v|
PrimitiveLiteral::from(v).into()),
})
.collect(),
)
@@ -1096,7 +1103,7 @@ mod test {
use crate::{
io::FileIOBuilder,
spec::{
- manifest_list::_serde::ManifestListV1, FieldSummary, Literal,
ManifestContentType,
+ manifest_list::_serde::ManifestListV1, Datum, FieldSummary,
ManifestContentType,
ManifestFile, ManifestList, ManifestListWriter, NestedField,
PrimitiveType, StructType,
Type, UNASSIGNED_SEQUENCE_NUMBER,
},
@@ -1172,7 +1179,7 @@ mod test {
added_rows_count: Some(3),
existing_rows_count: Some(0),
deleted_rows_count: Some(0),
- partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Literal::long(1)), upper_bound:
Some(Literal::long(1))}],
+ partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Datum::long(1)), upper_bound:
Some(Datum::long(1))}],
key_metadata: vec![],
},
ManifestFile {
@@ -1189,7 +1196,7 @@ mod test {
added_rows_count: Some(3),
existing_rows_count: Some(0),
deleted_rows_count: Some(0),
- partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Literal::float(1.1)), upper_bound:
Some(Literal::float(2.1))}],
+ partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Datum::float(1.1)), upper_bound:
Some(Datum::float(2.1))}],
key_metadata: vec![],
}
]
@@ -1288,7 +1295,7 @@ mod test {
added_rows_count: Some(3),
existing_rows_count: Some(0),
deleted_rows_count: Some(0),
- partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Literal::long(1)), upper_bound:
Some(Literal::long(1))}],
+ partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Datum::long(1)), upper_bound:
Some(Datum::long(1))}],
key_metadata: vec![],
}]
}.try_into().unwrap();
@@ -1316,7 +1323,7 @@ mod test {
added_rows_count: Some(3),
existing_rows_count: Some(0),
deleted_rows_count: Some(0),
- partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Literal::long(1)), upper_bound:
Some(Literal::long(1))}],
+ partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Datum::long(1)), upper_bound:
Some(Datum::long(1))}],
key_metadata: vec![],
}]
};
@@ -1372,7 +1379,7 @@ mod test {
added_rows_count: Some(3),
existing_rows_count: Some(0),
deleted_rows_count: Some(0),
- partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Literal::long(1)), upper_bound:
Some(Literal::long(1))}],
+ partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Datum::long(1)), upper_bound:
Some(Datum::long(1))}],
key_metadata: vec![],
}]
};
@@ -1426,7 +1433,7 @@ mod test {
added_rows_count: Some(3),
existing_rows_count: Some(0),
deleted_rows_count: Some(0),
- partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Literal::long(1)), upper_bound:
Some(Literal::long(1))}],
+ partitions: vec![FieldSummary { contains_null: false,
contains_nan: Some(false), lower_bound: Some(Datum::long(1)), upper_bound:
Some(Datum::long(1))}],
key_metadata: vec![],
}]
};
diff --git a/crates/iceberg/src/spec/schema.rs b/crates/iceberg/src/spec/schema.rs
index 9364b3a..4302e3b 100644
--- a/crates/iceberg/src/spec/schema.rs
+++ b/crates/iceberg/src/spec/schema.rs
@@ -1056,8 +1056,9 @@ mod tests {
};
use crate::spec::schema::Schema;
use crate::spec::schema::_serde::{SchemaEnum, SchemaV1, SchemaV2};
+ use crate::spec::values::Map as MapValue;
use crate::spec::{prune_columns, Datum, Literal};
- use std::collections::{BTreeMap, HashMap, HashSet};
+ use std::collections::{HashMap, HashSet};
use super::DEFAULT_SCHEMA_ID;
@@ -1657,9 +1658,9 @@ table {
Some(Literal::string("qux item 1")),
Some(Literal::string("qux item 2")),
])),
- Some(Literal::Map(BTreeMap::from([(
+ Some(Literal::Map(MapValue::from([(
Literal::string("quux key 1"),
- Some(Literal::Map(BTreeMap::from([(
+ Some(Literal::Map(MapValue::from([(
Literal::string("quux nested key 1"),
Some(Literal::int(1000)),
)]))),
diff --git a/crates/iceberg/src/spec/values.rs b/crates/iceberg/src/spec/values.rs
index c05d96f..57eecdc 100644
--- a/crates/iceberg/src/spec/values.rs
+++ b/crates/iceberg/src/spec/values.rs
@@ -19,10 +19,12 @@
* Value in iceberg
*/
+use std::any::Any;
+use std::collections::HashMap;
use std::fmt::{Display, Formatter};
+use std::hash::Hash;
use std::ops::Index;
use std::str::FromStr;
-use std::{any::Any, collections::BTreeMap};
use bitvec::vec::BitVec;
use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc};
@@ -48,7 +50,7 @@ use super::datatypes::{PrimitiveType, Type};
const MAX_TIME_VALUE: i64 = 24 * 60 * 60 * 1_000_000i64 - 1;
/// Values present in iceberg type
-#[derive(Clone, Debug, PartialEq, Hash, Eq, PartialOrd, Ord)]
+#[derive(Clone, Debug, PartialEq, Hash, Eq)]
pub enum PrimitiveLiteral {
/// 0x00 for false, non-zero byte for true
Boolean(bool),
@@ -67,7 +69,7 @@ pub enum PrimitiveLiteral {
/// Timestamp without timezone
Timestamp(i64),
/// Timestamp with timezone
- TimestampTZ(i64),
+ Timestamptz(i64),
/// UTF-8 bytes (without length)
String(String),
/// 16-byte big-endian value
@@ -103,6 +105,109 @@ pub struct Datum {
literal: PrimitiveLiteral,
}
+impl PartialOrd for Datum {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ match (&self.literal, &other.literal, &self.r#type, &other.r#type) {
+ // generate the arm with same type and same literal
+ (
+ PrimitiveLiteral::Boolean(val),
+ PrimitiveLiteral::Boolean(other_val),
+ PrimitiveType::Boolean,
+ PrimitiveType::Boolean,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Int(val),
+ PrimitiveLiteral::Int(other_val),
+ PrimitiveType::Int,
+ PrimitiveType::Int,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Long(val),
+ PrimitiveLiteral::Long(other_val),
+ PrimitiveType::Long,
+ PrimitiveType::Long,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Float(val),
+ PrimitiveLiteral::Float(other_val),
+ PrimitiveType::Float,
+ PrimitiveType::Float,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Double(val),
+ PrimitiveLiteral::Double(other_val),
+ PrimitiveType::Double,
+ PrimitiveType::Double,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Date(val),
+ PrimitiveLiteral::Date(other_val),
+ PrimitiveType::Date,
+ PrimitiveType::Date,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Time(val),
+ PrimitiveLiteral::Time(other_val),
+ PrimitiveType::Time,
+ PrimitiveType::Time,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Timestamp(val),
+ PrimitiveLiteral::Timestamp(other_val),
+ PrimitiveType::Timestamp,
+ PrimitiveType::Timestamp,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Timestamptz(val),
+ PrimitiveLiteral::Timestamptz(other_val),
+ PrimitiveType::Timestamptz,
+ PrimitiveType::Timestamptz,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::String(val),
+ PrimitiveLiteral::String(other_val),
+ PrimitiveType::String,
+ PrimitiveType::String,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::UUID(val),
+ PrimitiveLiteral::UUID(other_val),
+ PrimitiveType::Uuid,
+ PrimitiveType::Uuid,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Fixed(val),
+ PrimitiveLiteral::Fixed(other_val),
+ PrimitiveType::Fixed(_),
+ PrimitiveType::Fixed(_),
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Binary(val),
+ PrimitiveLiteral::Binary(other_val),
+ PrimitiveType::Binary,
+ PrimitiveType::Binary,
+ ) => val.partial_cmp(other_val),
+ (
+ PrimitiveLiteral::Decimal(val),
+ PrimitiveLiteral::Decimal(other_val),
+ PrimitiveType::Decimal {
+ precision: _,
+ scale,
+ },
+ PrimitiveType::Decimal {
+ precision: _,
+ scale: other_scale,
+ },
+ ) => {
+ let val = Decimal::from_i128_with_scale(*val, *scale);
+ let other_val = Decimal::from_i128_with_scale(*other_val,
*other_scale);
+ val.partial_cmp(&other_val)
+ }
+ _ => None,
+ }
+ }
+}
+
impl Display for Datum {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match (&self.r#type, &self.literal) {
@@ -116,7 +221,7 @@ impl Display for Datum {
(_, PrimitiveLiteral::Timestamp(val)) => {
write!(f, "{}", microseconds_to_datetime(*val))
}
- (_, PrimitiveLiteral::TimestampTZ(val)) => {
+ (_, PrimitiveLiteral::Timestamptz(val)) => {
write!(f, "{}", microseconds_to_datetimetz(*val))
}
(_, PrimitiveLiteral::String(val)) => write!(f, r#""{}""#, val),
@@ -153,12 +258,60 @@ impl From<Datum> for Literal {
}
}
+impl From<Datum> for PrimitiveLiteral {
+ fn from(value: Datum) -> Self {
+ value.literal
+ }
+}
+
impl Datum {
/// Creates a `Datum` from a `PrimitiveType` and a `PrimitiveLiteral`
pub(crate) fn new(r#type: PrimitiveType, literal: PrimitiveLiteral) ->
Self {
Datum { r#type, literal }
}
+ /// Create iceberg value from bytes
+ pub fn try_from_bytes(bytes: &[u8], data_type: PrimitiveType) ->
Result<Self> {
+ let literal = match data_type {
+ PrimitiveType::Boolean => {
+ if bytes.len() == 1 && bytes[0] == 0u8 {
+ PrimitiveLiteral::Boolean(false)
+ } else {
+ PrimitiveLiteral::Boolean(true)
+ }
+ }
+ PrimitiveType::Int =>
PrimitiveLiteral::Int(i32::from_le_bytes(bytes.try_into()?)),
+ PrimitiveType::Long =>
PrimitiveLiteral::Long(i64::from_le_bytes(bytes.try_into()?)),
+ PrimitiveType::Float => {
+
PrimitiveLiteral::Float(OrderedFloat(f32::from_le_bytes(bytes.try_into()?)))
+ }
+ PrimitiveType::Double => {
+
PrimitiveLiteral::Double(OrderedFloat(f64::from_le_bytes(bytes.try_into()?)))
+ }
+ PrimitiveType::Date =>
PrimitiveLiteral::Date(i32::from_le_bytes(bytes.try_into()?)),
+ PrimitiveType::Time =>
PrimitiveLiteral::Time(i64::from_le_bytes(bytes.try_into()?)),
+ PrimitiveType::Timestamp => {
+
PrimitiveLiteral::Timestamp(i64::from_le_bytes(bytes.try_into()?))
+ }
+ PrimitiveType::Timestamptz => {
+
PrimitiveLiteral::Timestamptz(i64::from_le_bytes(bytes.try_into()?))
+ }
+ PrimitiveType::String => {
+
PrimitiveLiteral::String(std::str::from_utf8(bytes)?.to_string())
+ }
+ PrimitiveType::Uuid => {
+
PrimitiveLiteral::UUID(Uuid::from_u128(u128::from_be_bytes(bytes.try_into()?)))
+ }
+ PrimitiveType::Fixed(_) =>
PrimitiveLiteral::Fixed(Vec::from(bytes)),
+ PrimitiveType::Binary =>
PrimitiveLiteral::Binary(Vec::from(bytes)),
+ PrimitiveType::Decimal {
+ precision: _,
+ scale: _,
+ } => todo!(),
+ };
+ Ok(Datum::new(data_type, literal))
+ }
+
/// Creates a boolean value.
///
/// Example:
@@ -499,7 +652,7 @@ impl Datum {
pub fn timestamptz_micros(value: i64) -> Self {
Self {
r#type: PrimitiveType::Timestamptz,
- literal: PrimitiveLiteral::TimestampTZ(value),
+ literal: PrimitiveLiteral::Timestamptz(value),
}
}
@@ -713,8 +866,114 @@ impl Datum {
}
}
+/// An insertion-ordered collection of key-value pairs, used by `Literal::Map`.
+///
+/// To make the map hashable, the key-value pairs are stored in a separate vector
+/// in insertion order, so the map can be hashed in a deterministic way. As a
+/// consequence, the order of the key-value pairs matters for the hash value and
+/// for the derived equality; use `has_same_content` to compare two maps without
+/// considering the order.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct Map {
+ /// Position of each key's entry within `pair`.
+ index: HashMap<Literal, usize>,
+ /// Key-value pairs in insertion order.
+ pair: Vec<(Literal, Option<Literal>)>,
+}
+
+impl Map {
+    /// Builds a map that holds no entries.
+    pub fn new() -> Self {
+        Map {
+            index: HashMap::new(),
+            pair: Vec::new(),
+        }
+    }
+
+    /// Number of key-value pairs currently stored.
+    pub fn len(&self) -> usize {
+        self.pair.len()
+    }
+
+    /// Whether the map holds no key-value pairs.
+    pub fn is_empty(&self) -> bool {
+        self.pair.is_empty()
+    }
+
+    /// Stores `value` under `key`.
+    ///
+    /// Returns `None` when the key was not present before. When the key was
+    /// already present, its value is overwritten in place (the entry keeps its
+    /// original position) and the previous value is returned.
+    pub fn insert(&mut self, key: Literal, value: Option<Literal>) -> Option<Option<Literal>> {
+        match self.index.get(&key).copied() {
+            Some(position) => Some(std::mem::replace(&mut self.pair[position].1, value)),
+            None => {
+                // New keys are appended, so their position is the current length.
+                let position = self.pair.len();
+                self.pair.push((key.clone(), value));
+                self.index.insert(key, position);
+                None
+            }
+        }
+    }
+
+    /// Looks up the value stored under `key`, or `None` when the key is absent.
+    pub fn get(&self, key: &Literal) -> Option<&Option<Literal>> {
+        let position = *self.index.get(key)?;
+        Some(&self.pair[position].1)
+    }
+
+    /// Compares two maps by content only.
+    ///
+    /// Derived equality is sensitive to insertion order; this method instead
+    /// reports whether both maps hold the same key-value pairs, regardless of
+    /// the order in which they were inserted.
+    pub fn has_same_content(&self, other: &Map) -> bool {
+        self.len() == other.len()
+            && self
+                .pair
+                .iter()
+                .all(|(key, value)| other.get(key) == Some(value))
+    }
+}
+
+/// The default map is the empty map.
+impl Default for Map {
+    fn default() -> Self {
+        Map::new()
+    }
+}
+
+/// Hashes every key and value in insertion order, so two maps with the same
+/// pairs inserted in a different order may produce different hash values.
+impl Hash for Map {
+    fn hash<H>(&self, state: &mut H)
+    where
+        H: std::hash::Hasher,
+    {
+        self.pair.iter().for_each(|(key, value)| {
+            key.hash(state);
+            value.hash(state);
+        });
+    }
+}
+
+/// Collects key-value pairs into a map by inserting them one after another;
+/// when a key occurs more than once, the last value wins.
+impl FromIterator<(Literal, Option<Literal>)> for Map {
+    fn from_iter<T: IntoIterator<Item = (Literal, Option<Literal>)>>(iter: T) -> Self {
+        iter.into_iter().fold(Map::new(), |mut collected, (key, value)| {
+            collected.insert(key, value);
+            collected
+        })
+    }
+}
+
+/// Consumes the map and yields its key-value pairs in insertion order.
+impl IntoIterator for Map {
+    type Item = (Literal, Option<Literal>);
+    type IntoIter = std::vec::IntoIter<Self::Item>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        // Only the ordered pairs are needed; the lookup index is dropped.
+        let Map { pair, .. } = self;
+        pair.into_iter()
+    }
+}
+
+/// Builds a map from a fixed-size array of key-value pairs, inserted in array order.
+impl<const N: usize> From<[(Literal, Option<Literal>); N]> for Map {
+    fn from(value: [(Literal, Option<Literal>); N]) -> Self {
+        // The array is owned, so move its elements into the map instead of
+        // cloning each pair.
+        Self::from_iter(value)
+    }
+}
+
/// Values present in iceberg type
-#[derive(Clone, Debug, PartialEq, Hash, Eq, PartialOrd, Ord)]
+#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Literal {
/// A primitive value
Primitive(PrimitiveLiteral),
@@ -729,7 +988,7 @@ pub enum Literal {
/// A map is a collection of key-value pairs with a key type and a value
type.
/// Both the key field and value field each have an integer id that is
unique in the table schema.
/// Map keys are required and map values can be either optional or
required. Both map keys and map values may be any type, including nested types.
- Map(BTreeMap<Literal, Option<Literal>>),
+ Map(Map),
}
impl Literal {
@@ -939,7 +1198,7 @@ impl Literal {
/// Creates a timestamp with timezone from unix epoch in microseconds.
pub fn timestamptz(value: i64) -> Self {
- Self::Primitive(PrimitiveLiteral::TimestampTZ(value))
+ Self::Primitive(PrimitiveLiteral::Timestamptz(value))
}
/// Creates a timestamp from [`DateTime`].
@@ -1077,61 +1336,73 @@ impl Literal {
}
}
+/// Serializes a primitive literal into a `ByteBuf`.
+///
+/// The bytes are exactly those produced by the `Vec<u8>` conversion of
+/// `PrimitiveLiteral`; delegating to it keeps the two encodings from drifting.
+///
+/// # Panics
+///
+/// Panics for `PrimitiveLiteral::Decimal`, whose encoding is not implemented
+/// in the underlying `Vec<u8>` conversion.
+impl From<PrimitiveLiteral> for ByteBuf {
+    fn from(value: PrimitiveLiteral) -> Self {
+        ByteBuf::from(Vec::<u8>::from(value))
+    }
+}
+
impl From<Literal> for ByteBuf {
fn from(value: Literal) -> Self {
match value {
- Literal::Primitive(prim) => match prim {
- PrimitiveLiteral::Boolean(val) => {
- if val {
- ByteBuf::from([1u8])
- } else {
- ByteBuf::from([0u8])
- }
- }
- PrimitiveLiteral::Int(val) => ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Long(val) =>
ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Float(val) =>
ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Double(val) =>
ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Date(val) =>
ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Time(val) =>
ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Timestamp(val) =>
ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::TimestampTZ(val) =>
ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::String(val) => ByteBuf::from(val.as_bytes()),
- PrimitiveLiteral::UUID(val) =>
ByteBuf::from(val.as_u128().to_be_bytes()),
- PrimitiveLiteral::Fixed(val) => ByteBuf::from(val),
- PrimitiveLiteral::Binary(val) => ByteBuf::from(val),
- PrimitiveLiteral::Decimal(_) => todo!(),
- },
+ Literal::Primitive(val) => val.into(),
_ => unimplemented!(),
}
}
}
+/// Serializes a primitive literal into its raw bytes.
+///
+/// Booleans become a single `0`/`1` byte, numeric and temporal values are
+/// written little-endian, UUIDs as a big-endian 128-bit integer, strings as
+/// their UTF-8 bytes, and `fixed`/`binary` payloads are returned as-is.
+///
+/// # Panics
+///
+/// Panics for `PrimitiveLiteral::Decimal`, which is not implemented yet.
+impl From<PrimitiveLiteral> for Vec<u8> {
+    fn from(value: PrimitiveLiteral) -> Self {
+        match value {
+            PrimitiveLiteral::Boolean(flag) => vec![u8::from(flag)],
+            // 32-bit integer payloads.
+            PrimitiveLiteral::Int(raw) | PrimitiveLiteral::Date(raw) => raw.to_le_bytes().to_vec(),
+            // 64-bit integer payloads.
+            PrimitiveLiteral::Long(raw)
+            | PrimitiveLiteral::Time(raw)
+            | PrimitiveLiteral::Timestamp(raw)
+            | PrimitiveLiteral::Timestamptz(raw) => raw.to_le_bytes().to_vec(),
+            PrimitiveLiteral::Float(raw) => raw.to_le_bytes().to_vec(),
+            PrimitiveLiteral::Double(raw) => raw.to_le_bytes().to_vec(),
+            PrimitiveLiteral::String(text) => text.into_bytes(),
+            PrimitiveLiteral::UUID(id) => id.as_u128().to_be_bytes().to_vec(),
+            PrimitiveLiteral::Fixed(payload) | PrimitiveLiteral::Binary(payload) => payload,
+            PrimitiveLiteral::Decimal(_) => todo!(),
+        }
+    }
+}
+
impl From<Literal> for Vec<u8> {
fn from(value: Literal) -> Self {
match value {
- Literal::Primitive(prim) => match prim {
- PrimitiveLiteral::Boolean(val) => {
- if val {
- Vec::from([1u8])
- } else {
- Vec::from([0u8])
- }
- }
- PrimitiveLiteral::Int(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Long(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Float(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Double(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Date(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Time(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Timestamp(val) =>
Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::TimestampTZ(val) =>
Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::String(val) => Vec::from(val.as_bytes()),
- PrimitiveLiteral::UUID(val) =>
Vec::from(val.as_u128().to_be_bytes()),
- PrimitiveLiteral::Fixed(val) => val,
- PrimitiveLiteral::Binary(val) => val,
- PrimitiveLiteral::Decimal(_) => todo!(),
- },
+ Literal::Primitive(val) => val.into(),
_ => unimplemented!(),
}
}
@@ -1140,7 +1411,7 @@ impl From<Literal> for Vec<u8> {
/// The partition struct stores the tuple of partition values for each file.
/// Its type is derived from the partition fields of the partition spec used
to write the manifest file.
/// In v2, the partition struct’s field ids must match the ids from the
partition spec.
-#[derive(Debug, Clone, PartialEq, Hash, Eq, PartialOrd, Ord)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Struct {
/// Vector to store the field values
fields: Vec<Literal>,
@@ -1237,55 +1508,10 @@ impl Literal {
/// Create iceberg value from bytes
pub fn try_from_bytes(bytes: &[u8], data_type: &Type) -> Result<Self> {
match data_type {
- Type::Primitive(primitive) => match primitive {
- PrimitiveType::Boolean => {
- if bytes.len() == 1 && bytes[0] == 0u8 {
-
Ok(Literal::Primitive(PrimitiveLiteral::Boolean(false)))
- } else {
- Ok(Literal::Primitive(PrimitiveLiteral::Boolean(true)))
- }
- }
- PrimitiveType::Int =>
Ok(Literal::Primitive(PrimitiveLiteral::Int(
- i32::from_le_bytes(bytes.try_into()?),
- ))),
- PrimitiveType::Long =>
Ok(Literal::Primitive(PrimitiveLiteral::Long(
- i64::from_le_bytes(bytes.try_into()?),
- ))),
- PrimitiveType::Float =>
Ok(Literal::Primitive(PrimitiveLiteral::Float(
- OrderedFloat(f32::from_le_bytes(bytes.try_into()?)),
- ))),
- PrimitiveType::Double =>
Ok(Literal::Primitive(PrimitiveLiteral::Double(
- OrderedFloat(f64::from_le_bytes(bytes.try_into()?)),
- ))),
- PrimitiveType::Date =>
Ok(Literal::Primitive(PrimitiveLiteral::Date(
- i32::from_le_bytes(bytes.try_into()?),
- ))),
- PrimitiveType::Time =>
Ok(Literal::Primitive(PrimitiveLiteral::Time(
- i64::from_le_bytes(bytes.try_into()?),
- ))),
- PrimitiveType::Timestamp =>
Ok(Literal::Primitive(PrimitiveLiteral::Timestamp(
- i64::from_le_bytes(bytes.try_into()?),
- ))),
- PrimitiveType::Timestamptz => Ok(Literal::Primitive(
-
PrimitiveLiteral::TimestampTZ(i64::from_le_bytes(bytes.try_into()?)),
- )),
- PrimitiveType::String =>
Ok(Literal::Primitive(PrimitiveLiteral::String(
- std::str::from_utf8(bytes)?.to_string(),
- ))),
- PrimitiveType::Uuid =>
Ok(Literal::Primitive(PrimitiveLiteral::UUID(
- Uuid::from_u128(u128::from_be_bytes(bytes.try_into()?)),
- ))),
- PrimitiveType::Fixed(_) =>
Ok(Literal::Primitive(PrimitiveLiteral::Fixed(
- Vec::from(bytes),
- ))),
- PrimitiveType::Binary =>
Ok(Literal::Primitive(PrimitiveLiteral::Binary(
- Vec::from(bytes),
- ))),
- PrimitiveType::Decimal {
- precision: _,
- scale: _,
- } => todo!(),
- },
+ Type::Primitive(primitive_type) => {
+ let datum = Datum::try_from_bytes(bytes,
primitive_type.clone())?;
+ Ok(Literal::Primitive(datum.literal))
+ }
_ => Err(Error::new(
crate::ErrorKind::DataInvalid,
"Converting bytes to non-primitive types is not supported.",
@@ -1345,7 +1571,7 @@ impl Literal {
)),
))),
(PrimitiveType::Timestamptz, JsonValue::String(s)) => {
- Ok(Some(Literal::Primitive(PrimitiveLiteral::TimestampTZ(
+ Ok(Some(Literal::Primitive(PrimitiveLiteral::Timestamptz(
timestamptz::datetimetz_to_microseconds(&Utc.from_utc_datetime(
&NaiveDateTime::parse_from_str(&s,
"%Y-%m-%dT%H:%M:%S%.f+00:00")?,
)),
@@ -1426,7 +1652,7 @@ impl Literal {
if let (Some(JsonValue::Array(keys)),
Some(JsonValue::Array(values))) =
(object.remove("keys"), object.remove("values"))
{
- Ok(Some(Literal::Map(BTreeMap::from_iter(
+ Ok(Some(Literal::Map(Map::from_iter(
keys.into_iter()
.zip(values.into_iter())
.map(|(key, value)| {
@@ -1487,7 +1713,7 @@ impl Literal {
.format("%Y-%m-%dT%H:%M:%S%.f")
.to_string(),
)),
- PrimitiveLiteral::TimestampTZ(val) => Ok(JsonValue::String(
+ PrimitiveLiteral::Timestamptz(val) => Ok(JsonValue::String(
timestamptz::microseconds_to_datetimetz(val)
.format("%Y-%m-%dT%H:%M:%S%.f+00:00")
.to_string(),
@@ -1578,7 +1804,7 @@ impl Literal {
PrimitiveLiteral::Date(any) => Box::new(any),
PrimitiveLiteral::Time(any) => Box::new(any),
PrimitiveLiteral::Timestamp(any) => Box::new(any),
- PrimitiveLiteral::TimestampTZ(any) => Box::new(any),
+ PrimitiveLiteral::Timestamptz(any) => Box::new(any),
PrimitiveLiteral::Fixed(any) => Box::new(any),
PrimitiveLiteral::Binary(any) => Box::new(any),
PrimitiveLiteral::String(any) => Box::new(any),
@@ -1667,8 +1893,6 @@ mod timestamptz {
}
mod _serde {
- use std::collections::BTreeMap;
-
use serde::{
de::Visitor,
ser::{SerializeMap, SerializeSeq, SerializeStruct},
@@ -1683,7 +1907,7 @@ mod _serde {
Error, ErrorKind,
};
- use super::{Literal, PrimitiveLiteral};
+ use super::{Literal, Map, PrimitiveLiteral};
#[derive(SerializeDerive, DeserializeDerive, Debug)]
#[serde(transparent)]
@@ -1928,7 +2152,7 @@ mod _serde {
super::PrimitiveLiteral::Date(v) => RawLiteralEnum::Int(v),
super::PrimitiveLiteral::Time(v) =>
RawLiteralEnum::Long(v),
super::PrimitiveLiteral::Timestamp(v) =>
RawLiteralEnum::Long(v),
- super::PrimitiveLiteral::TimestampTZ(v) =>
RawLiteralEnum::Long(v),
+ super::PrimitiveLiteral::Timestamptz(v) =>
RawLiteralEnum::Long(v),
super::PrimitiveLiteral::String(v) =>
RawLiteralEnum::String(v),
super::PrimitiveLiteral::UUID(v) => {
RawLiteralEnum::Bytes(ByteBuf::from(v.as_u128().to_be_bytes()))
@@ -2138,7 +2362,7 @@ mod _serde {
Type::Map(map_ty) => {
let key_ty = map_ty.key_field.field_type.as_ref();
let value_ty =
map_ty.value_field.field_type.as_ref();
- let mut map = BTreeMap::new();
+ let mut map = Map::new();
for k_v in v.list {
let k_v = k_v.ok_or_else(||
invalid_err_with_reason("list","In deserialize, None will be represented as
Some(RawLiteral::Null), all element in list must be valid"))?;
if let RawLiteralEnum::Record(Record {
@@ -2221,7 +2445,7 @@ mod _serde {
"Map key must be string",
));
}
- let mut map = BTreeMap::new();
+ let mut map = Map::new();
for (k, v) in required {
let value =
v.try_into(&map_ty.value_field.field_type)?;
if map_ty.value_field.required && value.is_none() {
@@ -2433,7 +2657,7 @@ mod tests {
check_json_serde(
record,
-
Literal::Primitive(PrimitiveLiteral::TimestampTZ(1510871468123456)),
+
Literal::Primitive(PrimitiveLiteral::Timestamptz(1510871468123456)),
&Type::Primitive(PrimitiveType::Timestamptz),
);
}
@@ -2523,7 +2747,7 @@ mod tests {
check_json_serde(
record,
- Literal::Map(BTreeMap::from([
+ Literal::Map(Map::from([
(
Literal::Primitive(PrimitiveLiteral::String("a".to_string())),
Some(Literal::Primitive(PrimitiveLiteral::Int(1))),
@@ -2683,7 +2907,7 @@ mod tests {
#[test]
fn avro_convert_test_timestamptz() {
check_convert_with_avro(
-
Literal::Primitive(PrimitiveLiteral::TimestampTZ(1510871468123456)),
+
Literal::Primitive(PrimitiveLiteral::Timestamptz(1510871468123456)),
&Type::Primitive(PrimitiveType::Timestamptz),
);
}
@@ -2724,10 +2948,48 @@ mod tests {
);
}
+    /// Round-trips a map literal through Avro and checks the decoded map.
+    ///
+    /// The literal is wrapped in a single-field struct, written with an Avro
+    /// writer, read back, and converted to a `Literal` again. The decoded map
+    /// is compared with `has_same_content`, i.e. without considering the
+    /// order of its key-value pairs.
+    fn check_convert_with_avro_map(expected_literal: Literal, expected_type: &Type) {
+        let fields = vec![NestedField::required(1, "col", expected_type.clone()).into()];
+        let schema = Schema::builder()
+            .with_fields(fields.clone())
+            .build()
+            .unwrap();
+        let avro_schema = schema_to_avro_schema("test", &schema).unwrap();
+        let struct_type = Type::Struct(StructType::new(fields));
+        let struct_literal = Literal::Struct(Struct::from_iter(vec![Some(expected_literal)]));
+
+        // Encode the wrapped literal into an in-memory Avro container.
+        let raw_literal = RawLiteral::try_from(struct_literal.clone(), &struct_type).unwrap();
+        let mut writer = apache_avro::Writer::new(&avro_schema, Vec::new());
+        writer.append_ser(raw_literal).unwrap();
+        let encoded = writer.into_inner().unwrap();
+
+        // Decode every record and compare its map field with the expected one.
+        let reader = apache_avro::Reader::new(&encoded[..]).unwrap();
+        for record in reader {
+            let raw = apache_avro::from_value::<RawLiteral>(&record.unwrap()).unwrap();
+            let decoded = raw.try_into(&struct_type).unwrap().unwrap();
+            match (&decoded, &struct_literal) {
+                (Literal::Struct(actual), Literal::Struct(wanted)) => {
+                    match (&actual.fields[0], &wanted.fields[0]) {
+                        (Literal::Map(actual_map), Literal::Map(wanted_map)) => {
+                            assert!(actual_map.has_same_content(wanted_map))
+                        }
+                        _ => unreachable!(),
+                    }
+                }
+                _ => panic!("unexpected literal type"),
+            }
+        }
+    }
+
#[test]
fn avro_convert_test_map() {
- check_convert_with_avro(
- Literal::Map(BTreeMap::from([
+ check_convert_with_avro_map(
+ Literal::Map(Map::from([
(
Literal::Primitive(PrimitiveLiteral::Int(1)),
Some(Literal::Primitive(PrimitiveLiteral::Long(1))),
@@ -2750,8 +3012,8 @@ mod tests {
}),
);
- check_convert_with_avro(
- Literal::Map(BTreeMap::from([
+ check_convert_with_avro_map(
+ Literal::Map(Map::from([
(
Literal::Primitive(PrimitiveLiteral::Int(1)),
Some(Literal::Primitive(PrimitiveLiteral::Long(1))),
@@ -2780,8 +3042,8 @@ mod tests {
#[test]
fn avro_convert_test_string_map() {
- check_convert_with_avro(
- Literal::Map(BTreeMap::from([
+ check_convert_with_avro_map(
+ Literal::Map(Map::from([
(
Literal::Primitive(PrimitiveLiteral::String("a".to_string())),
Some(Literal::Primitive(PrimitiveLiteral::Int(1))),
@@ -2807,8 +3069,8 @@ mod tests {
}),
);
- check_convert_with_avro(
- Literal::Map(BTreeMap::from([
+ check_convert_with_avro_map(
+ Literal::Map(Map::from([
(
Literal::Primitive(PrimitiveLiteral::String("a".to_string())),
Some(Literal::Primitive(PrimitiveLiteral::Int(1))),
diff --git a/crates/iceberg/src/transform/temporal.rs
b/crates/iceberg/src/transform/temporal.rs
index 0cbdde0..f4005b8 100644
--- a/crates/iceberg/src/transform/temporal.rs
+++ b/crates/iceberg/src/transform/temporal.rs
@@ -70,7 +70,7 @@ impl TransformFunction for Year {
let val = match input.literal() {
PrimitiveLiteral::Date(v) => Date32Type::to_naive_date(*v).year()
- UNIX_EPOCH_YEAR,
PrimitiveLiteral::Timestamp(v) => Self::timestamp_to_year(*v)?,
- PrimitiveLiteral::TimestampTZ(v) => Self::timestamp_to_year(*v)?,
+ PrimitiveLiteral::Timestamptz(v) => Self::timestamp_to_year(*v)?,
_ => {
return Err(crate::Error::new(
crate::ErrorKind::FeatureUnsupported,
@@ -142,7 +142,7 @@ impl TransformFunction for Month {
+ Date32Type::to_naive_date(*v).month0() as i32
}
PrimitiveLiteral::Timestamp(v) => Self::timestamp_to_month(*v)?,
- PrimitiveLiteral::TimestampTZ(v) => Self::timestamp_to_month(*v)?,
+ PrimitiveLiteral::Timestamptz(v) => Self::timestamp_to_month(*v)?,
_ => {
return Err(crate::Error::new(
crate::ErrorKind::FeatureUnsupported,
@@ -223,7 +223,7 @@ impl TransformFunction for Day {
let val = match input.literal() {
PrimitiveLiteral::Date(v) => *v,
PrimitiveLiteral::Timestamp(v) => Self::day_timestamp_micro(*v)?,
- PrimitiveLiteral::TimestampTZ(v) => Self::day_timestamp_micro(*v)?,
+ PrimitiveLiteral::Timestamptz(v) => Self::day_timestamp_micro(*v)?,
_ => {
return Err(crate::Error::new(
crate::ErrorKind::FeatureUnsupported,
@@ -273,7 +273,7 @@ impl TransformFunction for Hour {
fn transform_literal(&self, input: &crate::spec::Datum) ->
Result<Option<crate::spec::Datum>> {
let val = match input.literal() {
PrimitiveLiteral::Timestamp(v) => Self::hour_timestamp_micro(*v),
- PrimitiveLiteral::TimestampTZ(v) => Self::hour_timestamp_micro(*v),
+ PrimitiveLiteral::Timestamptz(v) => Self::hour_timestamp_micro(*v),
_ => {
return Err(crate::Error::new(
crate::ErrorKind::FeatureUnsupported,