Fokko commented on code in PR #25:
URL: https://github.com/apache/iceberg-rust/pull/25#discussion_r1288009695
##########
crates/iceberg/Cargo.toml:
##########
@@ -28,12 +28,14 @@ keywords = ["iceberg"]
[dependencies]
apache-avro = "0.15.0"
-serde = "^1.0"
+serde = {version = "^1.0", features = ["rc"]}
Review Comment:
I also like to live dangerous :D
##########
crates/iceberg/src/spec/datatypes.rs:
##########
@@ -438,13 +461,15 @@ impl fmt::Display for NestedField {
/// Elements can be either optional or required. Element types may be any type.
pub struct ListType {
/// Element field of list type.
- pub element_field: NestedField,
+ pub element_field: NestedFieldRef,
}
/// Module for type serialization/deserialization.
pub(super) mod _serde {
use crate::spec::datatypes::Type::Map;
- use crate::spec::datatypes::{ListType, MapType, NestedField,
PrimitiveType, StructType, Type};
+ use crate::spec::datatypes::{
+ ListType, MapType, NestedField, NestedFieldRef, PrimitiveType,
StructType, Type,
Review Comment:
Nit: Looks like there is a trailing comma now, shouldn't clippy take care of
that?
##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -48,14 +68,98 @@ impl SchemaBuilder {
self
}
+ /// Set identifier field ids.
+ pub fn with_identifier_field_ids(mut self, ids: impl IntoIterator<Item =
i32>) -> Self {
+ self.identifier_field_ids.extend(ids);
+ self
+ }
+
+ /// Set alias to filed id mapping.
+ pub fn with_alias(mut self, alias_to_id: BiHashMap<String, i32>) -> Self {
+ self.alias_to_id = alias_to_id;
+ self
+ }
+
/// Builds the schema.
- pub fn build(self) -> Schema {
+ pub fn build(self) -> Result<Schema> {
let highest_field_id = self.fields.iter().map(|f|
f.id).max().unwrap_or(0);
- Schema {
- r#struct: StructType::new(self.fields),
+
+ let r#struct = StructType::new(self.fields);
+ let id_to_field = index_by_id(&r#struct)?;
+
+ Self::validate_identifier_ids(
+ &r#struct,
+ &id_to_field,
+ self.identifier_field_ids.iter().copied(),
+ )?;
+
+ let (name_to_id, id_to_name) = {
+ let mut index = IndexByName::default();
+ visit_struct(&r#struct, &mut index)?;
+ index.indexes()
+ };
+
+ Ok(Schema {
+ r#struct,
schema_id: self.schema_id,
highest_field_id,
+ identifier_field_ids: self.identifier_field_ids,
+
+ alias_to_id: self.alias_to_id,
+ id_to_field,
+
+ name_to_id,
+ id_to_name,
+ lower_case_name_to_id: OnceCell::default(),
+ })
+ }
+
+ fn validate_identifier_ids(
+ r#struct: &StructType,
+ id_to_field: &HashMap<i32, NestedFieldRef>,
+ identifier_field_ids: impl Iterator<Item = i32>,
+ ) -> Result<()> {
+ let id_to_parent = index_parents(r#struct);
+ for identifier_field_id in identifier_field_ids {
+ let field = id_to_field.get(&identifier_field_id)
+ .ok_or_else(|| Error::new(ErrorKind::DataInvalid,
format!("Cannot add fieldId {identifier_field_id} as an identifier field: field
does not exist") ))?;
Review Comment:
Suggestion:
```suggestion
.ok_or_else(|| Error::new(ErrorKind::DataInvalid,
format!("Cannot add identifier field {identifier_field_id}: field does not
exist") ))?;
```
##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -84,28 +206,853 @@ impl Schema {
pub fn schema_id(&self) -> i32 {
self.schema_id
}
+
+ /// Returns [`r#struct`].
+ #[inline]
+ pub fn as_struct(&self) -> &StructType {
+ &self.r#struct
+ }
+
+ /// Get field id by full name.
+ pub fn field_id_by_name(&self, name: &str) -> Option<i32> {
+ self.name_to_id.get(name).cloned()
+ }
+
+ /// Get field id by full name.
+ pub fn name_by_field_id(&self, field_id: i32) -> Option<&str> {
+ self.id_to_name.get(&field_id).map(String::as_str)
+ }
+}
+
+impl Display for Schema {
Review Comment:
😍
##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -48,14 +68,98 @@ impl SchemaBuilder {
self
}
+ /// Set identifier field ids.
+ pub fn with_identifier_field_ids(mut self, ids: impl IntoIterator<Item =
i32>) -> Self {
+ self.identifier_field_ids.extend(ids);
+ self
+ }
+
+ /// Set alias to filed id mapping.
+ pub fn with_alias(mut self, alias_to_id: BiHashMap<String, i32>) -> Self {
+ self.alias_to_id = alias_to_id;
+ self
+ }
+
/// Builds the schema.
- pub fn build(self) -> Schema {
+ pub fn build(self) -> Result<Schema> {
let highest_field_id = self.fields.iter().map(|f|
f.id).max().unwrap_or(0);
- Schema {
- r#struct: StructType::new(self.fields),
+
+ let r#struct = StructType::new(self.fields);
+ let id_to_field = index_by_id(&r#struct)?;
+
+ Self::validate_identifier_ids(
+ &r#struct,
+ &id_to_field,
+ self.identifier_field_ids.iter().copied(),
+ )?;
+
+ let (name_to_id, id_to_name) = {
+ let mut index = IndexByName::default();
+ visit_struct(&r#struct, &mut index)?;
+ index.indexes()
+ };
+
+ Ok(Schema {
+ r#struct,
schema_id: self.schema_id,
highest_field_id,
+ identifier_field_ids: self.identifier_field_ids,
+
+ alias_to_id: self.alias_to_id,
+ id_to_field,
+
+ name_to_id,
+ id_to_name,
+ lower_case_name_to_id: OnceCell::default(),
+ })
+ }
+
+ fn validate_identifier_ids(
+ r#struct: &StructType,
+ id_to_field: &HashMap<i32, NestedFieldRef>,
+ identifier_field_ids: impl Iterator<Item = i32>,
+ ) -> Result<()> {
+ let id_to_parent = index_parents(r#struct);
+ for identifier_field_id in identifier_field_ids {
+ let field = id_to_field.get(&identifier_field_id)
+ .ok_or_else(|| Error::new(ErrorKind::DataInvalid,
format!("Cannot add fieldId {identifier_field_id} as an identifier field: field
does not exist") ))?;
+ ensure_data_valid!(
+ field.required,
+ "Cannot add field {} as an identifier field: not a required
field",
Review Comment:
How about removing a negation:
```suggestion
"Cannot add identifier field: {} is an optional field",
```
##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -84,28 +206,853 @@ impl Schema {
pub fn schema_id(&self) -> i32 {
self.schema_id
}
+
+ /// Returns [`r#struct`].
+ #[inline]
+ pub fn as_struct(&self) -> &StructType {
+ &self.r#struct
+ }
+
+ /// Get field id by full name.
+ pub fn field_id_by_name(&self, name: &str) -> Option<i32> {
+ self.name_to_id.get(name).cloned()
Review Comment:
Do we need to return a `cloned()` version here? Can't we make the `i32`
immutable?
##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -48,14 +68,98 @@ impl SchemaBuilder {
self
}
+ /// Set identifier field ids.
+ pub fn with_identifier_field_ids(mut self, ids: impl IntoIterator<Item =
i32>) -> Self {
+ self.identifier_field_ids.extend(ids);
+ self
+ }
+
+ /// Set alias to filed id mapping.
+ pub fn with_alias(mut self, alias_to_id: BiHashMap<String, i32>) -> Self {
+ self.alias_to_id = alias_to_id;
+ self
+ }
+
/// Builds the schema.
- pub fn build(self) -> Schema {
+ pub fn build(self) -> Result<Schema> {
let highest_field_id = self.fields.iter().map(|f|
f.id).max().unwrap_or(0);
- Schema {
- r#struct: StructType::new(self.fields),
+
+ let r#struct = StructType::new(self.fields);
+ let id_to_field = index_by_id(&r#struct)?;
+
+ Self::validate_identifier_ids(
+ &r#struct,
+ &id_to_field,
+ self.identifier_field_ids.iter().copied(),
+ )?;
+
+ let (name_to_id, id_to_name) = {
+ let mut index = IndexByName::default();
+ visit_struct(&r#struct, &mut index)?;
+ index.indexes()
+ };
+
+ Ok(Schema {
+ r#struct,
schema_id: self.schema_id,
highest_field_id,
+ identifier_field_ids: self.identifier_field_ids,
+
+ alias_to_id: self.alias_to_id,
+ id_to_field,
+
+ name_to_id,
+ id_to_name,
+ lower_case_name_to_id: OnceCell::default(),
+ })
+ }
+
+ fn validate_identifier_ids(
+ r#struct: &StructType,
+ id_to_field: &HashMap<i32, NestedFieldRef>,
+ identifier_field_ids: impl Iterator<Item = i32>,
+ ) -> Result<()> {
+ let id_to_parent = index_parents(r#struct);
+ for identifier_field_id in identifier_field_ids {
+ let field = id_to_field.get(&identifier_field_id)
+ .ok_or_else(|| Error::new(ErrorKind::DataInvalid,
format!("Cannot add fieldId {identifier_field_id} as an identifier field: field
does not exist") ))?;
+ ensure_data_valid!(
+ field.required,
+ "Cannot add field {} as an identifier field: not a required
field",
+ field.name
+ );
+ if let Type::Primitive(p) = field.field_type.as_ref() {
+ ensure_data_valid!(
+ !matches!(p, PrimitiveType::Double | PrimitiveType::Float),
+ "Cannot add field {} as an identifier field: must not be
float or double field",
+ field.name
+ );
+ } else {
+ return Err(Error::new(
+ ErrorKind::DataInvalid,
+ format!(
+ "Cannot add field {} as an identifier field: not a
primitive type field",
+ field.name
+ ),
+ ));
+ }
+
+ let mut cur_field_id = identifier_field_id;
+ while let Some(parent) = id_to_parent.get(&cur_field_id) {
+ let parent_field = id_to_field.get(parent)
+ .ok_or_else(|| Error::new(ErrorKind::DataInvalid,
format!("Cannot add fieldId {identifier_field_id} as an identifier field:
parent field does not exist") ))?;
Review Comment:
I don't think this is possible.
##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -48,14 +68,98 @@ impl SchemaBuilder {
self
}
+ /// Set identifier field ids.
+ pub fn with_identifier_field_ids(mut self, ids: impl IntoIterator<Item =
i32>) -> Self {
+ self.identifier_field_ids.extend(ids);
+ self
+ }
+
+ /// Set alias to filed id mapping.
+ pub fn with_alias(mut self, alias_to_id: BiHashMap<String, i32>) -> Self {
+ self.alias_to_id = alias_to_id;
+ self
+ }
+
/// Builds the schema.
- pub fn build(self) -> Schema {
+ pub fn build(self) -> Result<Schema> {
let highest_field_id = self.fields.iter().map(|f|
f.id).max().unwrap_or(0);
- Schema {
- r#struct: StructType::new(self.fields),
+
+ let r#struct = StructType::new(self.fields);
+ let id_to_field = index_by_id(&r#struct)?;
+
+ Self::validate_identifier_ids(
+ &r#struct,
+ &id_to_field,
+ self.identifier_field_ids.iter().copied(),
+ )?;
+
+ let (name_to_id, id_to_name) = {
+ let mut index = IndexByName::default();
+ visit_struct(&r#struct, &mut index)?;
+ index.indexes()
+ };
+
+ Ok(Schema {
+ r#struct,
schema_id: self.schema_id,
highest_field_id,
+ identifier_field_ids: self.identifier_field_ids,
+
+ alias_to_id: self.alias_to_id,
+ id_to_field,
+
+ name_to_id,
+ id_to_name,
+ lower_case_name_to_id: OnceCell::default(),
+ })
+ }
+
+ fn validate_identifier_ids(
+ r#struct: &StructType,
+ id_to_field: &HashMap<i32, NestedFieldRef>,
+ identifier_field_ids: impl Iterator<Item = i32>,
+ ) -> Result<()> {
+ let id_to_parent = index_parents(r#struct);
+ for identifier_field_id in identifier_field_ids {
+ let field = id_to_field.get(&identifier_field_id)
+ .ok_or_else(|| Error::new(ErrorKind::DataInvalid,
format!("Cannot add fieldId {identifier_field_id} as an identifier field: field
does not exist") ))?;
+ ensure_data_valid!(
+ field.required,
+ "Cannot add field {} as an identifier field: not a required
field",
+ field.name
+ );
+ if let Type::Primitive(p) = field.field_type.as_ref() {
+ ensure_data_valid!(
+ !matches!(p, PrimitiveType::Double | PrimitiveType::Float),
+ "Cannot add field {} as an identifier field: must not be
float or double field",
+ field.name
+ );
+ } else {
+ return Err(Error::new(
+ ErrorKind::DataInvalid,
+ format!(
+ "Cannot add field {} as an identifier field: not a
primitive type field",
+ field.name
+ ),
+ ));
+ }
+
+ let mut cur_field_id = identifier_field_id;
+ while let Some(parent) = id_to_parent.get(&cur_field_id) {
+ let parent_field = id_to_field.get(parent)
+ .ok_or_else(|| Error::new(ErrorKind::DataInvalid,
format!("Cannot add fieldId {identifier_field_id} as an identifier field:
parent field does not exist") ))?;
+ ensure_data_valid!(
+ parent_field.field_type.is_struct(),
+ "Cannot add field {} as an identifier field: must not be
nested in {:?}",
Review Comment:
Do we need a full visitor for this? We could also check if the field-id is
present in the root struct.
##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -84,28 +206,853 @@ impl Schema {
pub fn schema_id(&self) -> i32 {
self.schema_id
}
+
+ /// Returns [`r#struct`].
+ #[inline]
+ pub fn as_struct(&self) -> &StructType {
+ &self.r#struct
+ }
+
+ /// Get field id by full name.
+ pub fn field_id_by_name(&self, name: &str) -> Option<i32> {
+ self.name_to_id.get(name).cloned()
+ }
+
+ /// Get field id by full name.
+ pub fn name_by_field_id(&self, field_id: i32) -> Option<&str> {
+ self.id_to_name.get(&field_id).map(String::as_str)
+ }
+}
+
+impl Display for Schema {
Review Comment:
😍
##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -48,14 +68,98 @@ impl SchemaBuilder {
self
}
+ /// Set identifier field ids.
+ pub fn with_identifier_field_ids(mut self, ids: impl IntoIterator<Item =
i32>) -> Self {
+ self.identifier_field_ids.extend(ids);
+ self
+ }
+
+ /// Set alias to filed id mapping.
+ pub fn with_alias(mut self, alias_to_id: BiHashMap<String, i32>) -> Self {
+ self.alias_to_id = alias_to_id;
+ self
+ }
+
/// Builds the schema.
- pub fn build(self) -> Schema {
+ pub fn build(self) -> Result<Schema> {
let highest_field_id = self.fields.iter().map(|f|
f.id).max().unwrap_or(0);
- Schema {
- r#struct: StructType::new(self.fields),
+
+ let r#struct = StructType::new(self.fields);
+ let id_to_field = index_by_id(&r#struct)?;
+
+ Self::validate_identifier_ids(
+ &r#struct,
+ &id_to_field,
+ self.identifier_field_ids.iter().copied(),
+ )?;
+
+ let (name_to_id, id_to_name) = {
+ let mut index = IndexByName::default();
+ visit_struct(&r#struct, &mut index)?;
+ index.indexes()
+ };
+
+ Ok(Schema {
+ r#struct,
schema_id: self.schema_id,
highest_field_id,
+ identifier_field_ids: self.identifier_field_ids,
+
+ alias_to_id: self.alias_to_id,
+ id_to_field,
+
+ name_to_id,
+ id_to_name,
+ lower_case_name_to_id: OnceCell::default(),
+ })
+ }
+
+ fn validate_identifier_ids(
+ r#struct: &StructType,
+ id_to_field: &HashMap<i32, NestedFieldRef>,
+ identifier_field_ids: impl Iterator<Item = i32>,
+ ) -> Result<()> {
+ let id_to_parent = index_parents(r#struct);
+ for identifier_field_id in identifier_field_ids {
+ let field = id_to_field.get(&identifier_field_id)
+ .ok_or_else(|| Error::new(ErrorKind::DataInvalid,
format!("Cannot add fieldId {identifier_field_id} as an identifier field: field
does not exist") ))?;
+ ensure_data_valid!(
+ field.required,
+ "Cannot add field {} as an identifier field: not a required
field",
+ field.name
+ );
+ if let Type::Primitive(p) = field.field_type.as_ref() {
+ ensure_data_valid!(
+ !matches!(p, PrimitiveType::Double | PrimitiveType::Float),
+ "Cannot add field {} as an identifier field: must not be
float or double field",
Review Comment:
```suggestion
"Cannot add identifier field {}: cannot be a float or
double type",
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]