liurenjie1024 commented on code in PR #29:
URL: https://github.com/apache/iceberg-rust/pull/29#discussion_r1290871871
##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -609,13 +611,88 @@ impl SchemaVisitor for IndexByName {
}
}
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[serde(rename_all = "kebab-case")]
+/// Names and types of fields in a table.
+pub(crate) struct SchemaV2 {
Review Comment:
Can we remove `pub(crate)` here? I think this struct is only used inside the `schema` module, so it can stay private.
##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -609,13 +611,88 @@ impl SchemaVisitor for IndexByName {
}
}
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[serde(rename_all = "kebab-case")]
+/// Names and types of fields in a table.
+pub(crate) struct SchemaV2 {
+ /// Identifier of the schema
+ pub schema_id: i32,
+ /// Set of primitive fields that identify rows in a table.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub identifier_field_ids: Option<Vec<i32>>,
+
+ #[serde(flatten)]
+ /// The struct fields
+ pub fields: StructType,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[serde(rename_all = "kebab-case")]
+/// Names and types of fields in a table.
+pub(crate) struct SchemaV1 {
Review Comment:
Same as above: can we remove `pub(crate)` from `SchemaV1` as well?
##########
crates/iceberg/src/spec/snapshot.rs:
##########
@@ -0,0 +1,316 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*!
+ * Snapshots
+*/
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+use super::table_metadata::SnapshotLog;
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[serde(rename_all = "lowercase")]
+/// The operation field is used by some operations, like snapshot expiration,
to skip processing certain snapshots.
+pub enum Operation {
+ /// Only data files were added and no files were removed.
+ Append,
+ /// Data and delete files were added and removed without changing table
data;
+ /// i.e., compaction, changing the data file format, or relocating data
files.
+ Replace,
+ /// Data and delete files were added and removed in a logical overwrite
operation.
+ Overwrite,
+ /// Data files were removed and their contents logically deleted and/or
delete files were added to delete rows.
+ Delete,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+/// Summarises the changes in the snapshot.
+pub struct Summary {
+ /// The type of operation in the snapshot
+ pub operation: Operation,
+ /// Other summary data.
+ #[serde(flatten)]
+ pub other: HashMap<String, String>,
+}
+
+impl Default for Operation {
+ fn default() -> Operation {
+ Self::Append
+ }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone)]
+/// A snapshot represents the state of a table at some time and is used to
access the complete set of data files in the table.
+pub struct Snapshot {
+ /// A unique long ID
+ snapshot_id: i64,
+ /// The snapshot ID of the snapshot’s parent.
+ /// Omitted for any snapshot with no parent
+ parent_snapshot_id: Option<i64>,
+ /// A monotonically increasing long that tracks the order of
+ /// changes to a table.
+ sequence_number: i64,
+ /// A timestamp when the snapshot was created, used for garbage
+ /// collection and table inspection
+ timestamp_ms: i64,
+ /// The location of a manifest list for this snapshot that
+ /// tracks manifest files with additional metadata.
+ manifest_list: String,
+ /// A list of manifest file locations. Must be omitted if manifest-list is
present
+ manifests: Option<Vec<String>>,
Review Comment:
How about making this an enum, so that exactly one of the two representations can be present?
```rust
enum ManifestList {
    ManifestListFile(String),
    ManifestFiles(Vec<String>),
}
```
##########
crates/iceberg/src/spec/table_metadata.rs:
##########
@@ -0,0 +1,859 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*!
+Defines the [table metadata](https://iceberg.apache.org/spec/#table-metadata).
+The main struct here is [TableMetadataV2] which defines the data for a table.
+*/
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+use serde_repr::{Deserialize_repr, Serialize_repr};
+use uuid::Uuid;
+
+use crate::{Error, ErrorKind};
+
+use super::{
+ partition::{PartitionField, PartitionSpec},
+ schema::{self, Schema},
+ snapshot::{Reference, Retention, Snapshot, SnapshotV1, SnapshotV2},
+ sort::SortOrder,
+};
+
+static MAIN_BRANCH: &str = "main";
+
+#[derive(Debug, PartialEq, Serialize, Deserialize, Eq, Clone)]
+#[serde(try_from = "TableMetadataEnum", into = "TableMetadataEnum")]
+/// Fields for the version 2 of the table metadata.
+pub struct TableMetadata {
+ /// Integer Version for the format.
+ format_version: FormatVersion,
+ /// A UUID that identifies the table
+ table_uuid: Uuid,
+ /// Location tables base location
+ location: String,
+ /// The tables highest sequence number
+ last_sequence_number: i64,
+ /// Timestamp in milliseconds from the unix epoch when the table was last
updated.
+ last_updated_ms: i64,
+ /// An integer; the highest assigned column ID for the table.
+ last_column_id: i32,
+ /// A list of schemas, stored as objects with schema-id.
+ schemas: HashMap<i32, Schema>,
+ /// ID of the table’s current schema.
+ current_schema_id: i32,
+ /// A list of partition specs, stored as full partition spec objects.
+ partition_specs: HashMap<i32, PartitionSpec>,
+ /// ID of the “current” spec that writers should use by default.
+ default_spec_id: i32,
+ /// An integer; the highest assigned partition field ID across all
partition specs for the table.
+ last_partition_id: i32,
+ ///A string to string map of table properties. This is used to control
settings that
+ /// affect reading and writing and is not intended to be used for
arbitrary metadata.
+ /// For example, commit.retry.num-retries is used to control the number of
commit retries.
+ properties: Option<HashMap<String, String>>,
+ /// long ID of the current table snapshot; must be the same as the current
+ /// ID of the main branch in refs.
+ current_snapshot_id: Option<i64>,
+ ///A list of valid snapshots. Valid snapshots are snapshots for which all
+ /// data files exist in the file system. A data file must not be deleted
+ /// from the file system until the last snapshot in which it was listed is
+ /// garbage collected.
+ snapshots: Option<HashMap<i64, Snapshot>>,
+ /// A list (optional) of timestamp and snapshot ID pairs that encodes
changes
+ /// to the current snapshot for the table. Each time the
current-snapshot-id
+ /// is changed, a new entry should be added with the last-updated-ms
+ /// and the new current-snapshot-id. When snapshots are expired from
+ /// the list of valid snapshots, all entries before a snapshot that has
+ /// expired should be removed.
+ snapshot_log: Option<Vec<SnapshotLog>>,
Review Comment:
Use `Vec<SnapshotLog>` directly?
##########
crates/iceberg/src/spec/snapshot.rs:
##########
@@ -0,0 +1,316 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*!
+ * Snapshots
+*/
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+use super::table_metadata::SnapshotLog;
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[serde(rename_all = "lowercase")]
+/// The operation field is used by some operations, like snapshot expiration,
to skip processing certain snapshots.
+pub enum Operation {
+ /// Only data files were added and no files were removed.
+ Append,
+ /// Data and delete files were added and removed without changing table
data;
+ /// i.e., compaction, changing the data file format, or relocating data
files.
+ Replace,
+ /// Data and delete files were added and removed in a logical overwrite
operation.
+ Overwrite,
+ /// Data files were removed and their contents logically deleted and/or
delete files were added to delete rows.
+ Delete,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+/// Summarises the changes in the snapshot.
+pub struct Summary {
+ /// The type of operation in the snapshot
+ pub operation: Operation,
+ /// Other summary data.
+ #[serde(flatten)]
+ pub other: HashMap<String, String>,
+}
+
+impl Default for Operation {
+ fn default() -> Operation {
+ Self::Append
+ }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone)]
+/// A snapshot represents the state of a table at some time and is used to
access the complete set of data files in the table.
+pub struct Snapshot {
+ /// A unique long ID
+ snapshot_id: i64,
+ /// The snapshot ID of the snapshot’s parent.
+ /// Omitted for any snapshot with no parent
+ parent_snapshot_id: Option<i64>,
+ /// A monotonically increasing long that tracks the order of
+ /// changes to a table.
+ sequence_number: i64,
+ /// A timestamp when the snapshot was created, used for garbage
+ /// collection and table inspection
+ timestamp_ms: i64,
+ /// The location of a manifest list for this snapshot that
+ /// tracks manifest files with additional metadata.
+ manifest_list: String,
+ /// A list of manifest file locations. Must be omitted if manifest-list is
present
+ manifests: Option<Vec<String>>,
+ /// A string map that summarizes the snapshot changes, including operation.
+ summary: Summary,
+ /// ID of the table’s current schema when the snapshot was created.
+ schema_id: Option<i64>,
+}
+
+impl Snapshot {
+ /// Get the id of the snapshot
+ #[inline]
+ pub fn snapshot_id(&self) -> i64 {
+ self.snapshot_id
+ }
+ /// Get sequence_number of the snapshot. Is 0 for Iceberg V1 tables.
+ #[inline]
+ pub fn sequence_number(&self) -> i64 {
+ self.sequence_number
+ }
+ /// Get location of manifest_list file
+ #[inline]
+ pub fn manifest_list(&self) -> &str {
+ &self.manifest_list
+ }
+ /// Get summary of the snapshot
+ #[inline]
+ pub fn summary(&self) -> &Summary {
+ &self.summary
+ }
+ /// Get the timestamp of when the snapshot was created
+ #[inline]
+ pub fn timestamp(&self) -> i64 {
+ self.timestamp_ms
+ }
+
+ pub(crate) fn log(&self) -> SnapshotLog {
+ SnapshotLog {
+ timestamp_ms: self.timestamp_ms,
+ snapshot_id: self.snapshot_id,
+ }
+ }
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "kebab-case")]
+/// A snapshot represents the state of a table at some time and is used to
access the complete set of data files in the table.
+pub(crate) struct SnapshotV2 {
+ /// A unique long ID
+ pub snapshot_id: i64,
+ /// The snapshot ID of the snapshot’s parent.
+ /// Omitted for any snapshot with no parent
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub parent_snapshot_id: Option<i64>,
+ /// A monotonically increasing long that tracks the order of
+ /// changes to a table.
+ pub sequence_number: i64,
+ /// A timestamp when the snapshot was created, used for garbage
+ /// collection and table inspection
+ pub timestamp_ms: i64,
+ /// The location of a manifest list for this snapshot that
+ /// tracks manifest files with additional metadata.
+ pub manifest_list: String,
+ /// A string map that summarizes the snapshot changes, including operation.
+ pub summary: Summary,
+ /// ID of the table’s current schema when the snapshot was created.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub schema_id: Option<i64>,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "kebab-case")]
+/// A snapshot represents the state of a table at some time and is used to
access the complete set of data files in the table.
+pub(crate) struct SnapshotV1 {
+ /// A unique long ID
+ pub snapshot_id: i64,
+ /// The snapshot ID of the snapshot’s parent.
+ /// Omitted for any snapshot with no parent
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub parent_snapshot_id: Option<i64>,
+ /// A timestamp when the snapshot was created, used for garbage
+ /// collection and table inspection
+ pub timestamp_ms: i64,
+ /// The location of a manifest list for this snapshot that
+ /// tracks manifest files with additional metadata.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub manifest_list: Option<String>,
+ /// A list of manifest file locations. Must be omitted if manifest-list is
present
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub manifests: Option<Vec<String>>,
+ /// A string map that summarizes the snapshot changes, including operation.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub summary: Option<Summary>,
+ /// ID of the table’s current schema when the snapshot was created.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub schema_id: Option<i64>,
+}
+
+impl From<SnapshotV2> for Snapshot {
+ fn from(v2: SnapshotV2) -> Self {
+ Snapshot {
+ snapshot_id: v2.snapshot_id,
+ parent_snapshot_id: v2.parent_snapshot_id,
+ sequence_number: v2.sequence_number,
+ timestamp_ms: v2.timestamp_ms,
+ manifest_list: v2.manifest_list,
+ manifests: None,
+ summary: v2.summary,
+ schema_id: v2.schema_id,
+ }
+ }
+}
+
+impl From<Snapshot> for SnapshotV2 {
+ fn from(v2: Snapshot) -> Self {
+ SnapshotV2 {
+ snapshot_id: v2.snapshot_id,
+ parent_snapshot_id: v2.parent_snapshot_id,
+ sequence_number: v2.sequence_number,
+ timestamp_ms: v2.timestamp_ms,
+ manifest_list: v2.manifest_list,
+ summary: v2.summary,
+ schema_id: v2.schema_id,
+ }
+ }
+}
+
+impl From<SnapshotV1> for Snapshot {
+ fn from(v1: SnapshotV1) -> Self {
+ Snapshot {
+ snapshot_id: v1.snapshot_id,
+ parent_snapshot_id: v1.parent_snapshot_id,
+ sequence_number: 0,
+ timestamp_ms: v1.timestamp_ms,
+ manifest_list: v1.manifest_list.unwrap_or_default(),
+ manifests: v1.manifests,
+ summary: v1.summary.unwrap_or(Summary {
+ operation: Operation::default(),
+ other: HashMap::new(),
+ }),
+ schema_id: v1.schema_id,
+ }
+ }
+}
+
+impl From<Snapshot> for SnapshotV1 {
+ fn from(v2: Snapshot) -> Self {
+ SnapshotV1 {
+ snapshot_id: v2.snapshot_id,
+ parent_snapshot_id: v2.parent_snapshot_id,
+ timestamp_ms: v2.timestamp_ms,
+ manifest_list: Some(v2.manifest_list),
+ manifests: v2.manifests,
+ summary: Some(v2.summary),
+ schema_id: v2.schema_id,
+ }
+ }
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[serde(rename_all = "kebab-case")]
+/// Iceberg tables keep track of branches and tags using snapshot references.
+pub struct Reference {
Review Comment:
```suggestion
pub struct SnapshotReference {
```
##########
crates/iceberg/src/spec/snapshot.rs:
##########
@@ -0,0 +1,316 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*!
+ * Snapshots
+*/
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+use super::table_metadata::SnapshotLog;
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[serde(rename_all = "lowercase")]
+/// The operation field is used by some operations, like snapshot expiration,
to skip processing certain snapshots.
+pub enum Operation {
+ /// Only data files were added and no files were removed.
+ Append,
+ /// Data and delete files were added and removed without changing table
data;
+ /// i.e., compaction, changing the data file format, or relocating data
files.
+ Replace,
+ /// Data and delete files were added and removed in a logical overwrite
operation.
+ Overwrite,
+ /// Data files were removed and their contents logically deleted and/or
delete files were added to delete rows.
+ Delete,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+/// Summarises the changes in the snapshot.
+pub struct Summary {
+ /// The type of operation in the snapshot
+ pub operation: Operation,
+ /// Other summary data.
+ #[serde(flatten)]
+ pub other: HashMap<String, String>,
+}
+
+impl Default for Operation {
+ fn default() -> Operation {
+ Self::Append
+ }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone)]
+/// A snapshot represents the state of a table at some time and is used to
access the complete set of data files in the table.
+pub struct Snapshot {
+ /// A unique long ID
+ snapshot_id: i64,
+ /// The snapshot ID of the snapshot’s parent.
+ /// Omitted for any snapshot with no parent
+ parent_snapshot_id: Option<i64>,
+ /// A monotonically increasing long that tracks the order of
+ /// changes to a table.
+ sequence_number: i64,
+ /// A timestamp when the snapshot was created, used for garbage
+ /// collection and table inspection
+ timestamp_ms: i64,
+ /// The location of a manifest list for this snapshot that
+ /// tracks manifest files with additional metadata.
+ manifest_list: String,
+ /// A list of manifest file locations. Must be omitted if manifest-list is
present
+ manifests: Option<Vec<String>>,
+ /// A string map that summarizes the snapshot changes, including operation.
+ summary: Summary,
+ /// ID of the table’s current schema when the snapshot was created.
+ schema_id: Option<i64>,
+}
+
+impl Snapshot {
+ /// Get the id of the snapshot
+ #[inline]
+ pub fn snapshot_id(&self) -> i64 {
+ self.snapshot_id
+ }
+ /// Get sequence_number of the snapshot. Is 0 for Iceberg V1 tables.
+ #[inline]
+ pub fn sequence_number(&self) -> i64 {
+ self.sequence_number
+ }
+ /// Get location of manifest_list file
+ #[inline]
+ pub fn manifest_list(&self) -> &str {
+ &self.manifest_list
+ }
+ /// Get summary of the snapshot
+ #[inline]
+ pub fn summary(&self) -> &Summary {
+ &self.summary
+ }
+ /// Get the timestamp of when the snapshot was created
+ #[inline]
+ pub fn timestamp(&self) -> i64 {
+ self.timestamp_ms
+ }
+
+ pub(crate) fn log(&self) -> SnapshotLog {
+ SnapshotLog {
+ timestamp_ms: self.timestamp_ms,
+ snapshot_id: self.snapshot_id,
+ }
+ }
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "kebab-case")]
+/// A snapshot represents the state of a table at some time and is used to
access the complete set of data files in the table.
+pub(crate) struct SnapshotV2 {
+ /// A unique long ID
+ pub snapshot_id: i64,
+ /// The snapshot ID of the snapshot’s parent.
+ /// Omitted for any snapshot with no parent
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub parent_snapshot_id: Option<i64>,
+ /// A monotonically increasing long that tracks the order of
+ /// changes to a table.
+ pub sequence_number: i64,
+ /// A timestamp when the snapshot was created, used for garbage
+ /// collection and table inspection
+ pub timestamp_ms: i64,
+ /// The location of a manifest list for this snapshot that
+ /// tracks manifest files with additional metadata.
+ pub manifest_list: String,
+ /// A string map that summarizes the snapshot changes, including operation.
+ pub summary: Summary,
+ /// ID of the table’s current schema when the snapshot was created.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub schema_id: Option<i64>,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "kebab-case")]
+/// A snapshot represents the state of a table at some time and is used to
access the complete set of data files in the table.
+pub(crate) struct SnapshotV1 {
+ /// A unique long ID
+ pub snapshot_id: i64,
+ /// The snapshot ID of the snapshot’s parent.
+ /// Omitted for any snapshot with no parent
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub parent_snapshot_id: Option<i64>,
+ /// A timestamp when the snapshot was created, used for garbage
+ /// collection and table inspection
+ pub timestamp_ms: i64,
+ /// The location of a manifest list for this snapshot that
+ /// tracks manifest files with additional metadata.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub manifest_list: Option<String>,
+ /// A list of manifest file locations. Must be omitted if manifest-list is
present
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub manifests: Option<Vec<String>>,
+ /// A string map that summarizes the snapshot changes, including operation.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub summary: Option<Summary>,
+ /// ID of the table’s current schema when the snapshot was created.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub schema_id: Option<i64>,
+}
+
+impl From<SnapshotV2> for Snapshot {
+ fn from(v2: SnapshotV2) -> Self {
+ Snapshot {
+ snapshot_id: v2.snapshot_id,
+ parent_snapshot_id: v2.parent_snapshot_id,
+ sequence_number: v2.sequence_number,
+ timestamp_ms: v2.timestamp_ms,
+ manifest_list: v2.manifest_list,
+ manifests: None,
+ summary: v2.summary,
+ schema_id: v2.schema_id,
+ }
+ }
+}
+
+impl From<Snapshot> for SnapshotV2 {
+ fn from(v2: Snapshot) -> Self {
+ SnapshotV2 {
+ snapshot_id: v2.snapshot_id,
+ parent_snapshot_id: v2.parent_snapshot_id,
+ sequence_number: v2.sequence_number,
+ timestamp_ms: v2.timestamp_ms,
+ manifest_list: v2.manifest_list,
+ summary: v2.summary,
+ schema_id: v2.schema_id,
+ }
+ }
+}
+
+impl From<SnapshotV1> for Snapshot {
+ fn from(v1: SnapshotV1) -> Self {
+ Snapshot {
+ snapshot_id: v1.snapshot_id,
+ parent_snapshot_id: v1.parent_snapshot_id,
+ sequence_number: 0,
+ timestamp_ms: v1.timestamp_ms,
+ manifest_list: v1.manifest_list.unwrap_or_default(),
+ manifests: v1.manifests,
+ summary: v1.summary.unwrap_or(Summary {
+ operation: Operation::default(),
+ other: HashMap::new(),
+ }),
+ schema_id: v1.schema_id,
+ }
+ }
+}
+
+impl From<Snapshot> for SnapshotV1 {
+ fn from(v2: Snapshot) -> Self {
+ SnapshotV1 {
+ snapshot_id: v2.snapshot_id,
+ parent_snapshot_id: v2.parent_snapshot_id,
+ timestamp_ms: v2.timestamp_ms,
+ manifest_list: Some(v2.manifest_list),
+ manifests: v2.manifests,
+ summary: Some(v2.summary),
+ schema_id: v2.schema_id,
+ }
+ }
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[serde(rename_all = "kebab-case")]
+/// Iceberg tables keep track of branches and tags using snapshot references.
+pub struct Reference {
+ /// A reference’s snapshot ID. The tagged snapshot or latest snapshot of a
branch.
+ pub snapshot_id: i64,
+ #[serde(flatten)]
+ /// Snapshot retention policy
+ pub retention: Retention,
+}
+
+impl Reference {
+ /// Create new snapshot reference
+ pub fn new(snapshot_id: i64, retention: Retention) -> Self {
+ Reference {
+ snapshot_id,
+ retention,
+ }
+ }
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[serde(rename_all = "lowercase", tag = "type")]
+/// The snapshot expiration procedure removes snapshots from table metadata
and applies the table’s retention policy.
+pub enum Retention {
Review Comment:
```suggestion
pub enum SnapshotRetention {
```
##########
crates/iceberg/src/spec/table_metadata.rs:
##########
@@ -0,0 +1,859 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*!
+Defines the [table metadata](https://iceberg.apache.org/spec/#table-metadata).
+The main struct here is [TableMetadataV2] which defines the data for a table.
+*/
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+use serde_repr::{Deserialize_repr, Serialize_repr};
+use uuid::Uuid;
+
+use crate::{Error, ErrorKind};
+
+use super::{
+ partition::{PartitionField, PartitionSpec},
+ schema::{self, Schema},
+ snapshot::{Reference, Retention, Snapshot, SnapshotV1, SnapshotV2},
+ sort::SortOrder,
+};
+
+static MAIN_BRANCH: &str = "main";
+
+#[derive(Debug, PartialEq, Serialize, Deserialize, Eq, Clone)]
+#[serde(try_from = "TableMetadataEnum", into = "TableMetadataEnum")]
+/// Fields for the version 2 of the table metadata.
+pub struct TableMetadata {
+ /// Integer Version for the format.
+ format_version: FormatVersion,
+ /// A UUID that identifies the table
+ table_uuid: Uuid,
+ /// Location tables base location
+ location: String,
+ /// The tables highest sequence number
+ last_sequence_number: i64,
+ /// Timestamp in milliseconds from the unix epoch when the table was last
updated.
+ last_updated_ms: i64,
+ /// An integer; the highest assigned column ID for the table.
+ last_column_id: i32,
+ /// A list of schemas, stored as objects with schema-id.
+ schemas: HashMap<i32, Schema>,
+ /// ID of the table’s current schema.
+ current_schema_id: i32,
+ /// A list of partition specs, stored as full partition spec objects.
+ partition_specs: HashMap<i32, PartitionSpec>,
+ /// ID of the “current” spec that writers should use by default.
+ default_spec_id: i32,
+ /// An integer; the highest assigned partition field ID across all
partition specs for the table.
+ last_partition_id: i32,
+ ///A string to string map of table properties. This is used to control
settings that
+ /// affect reading and writing and is not intended to be used for
arbitrary metadata.
+ /// For example, commit.retry.num-retries is used to control the number of
commit retries.
+ properties: Option<HashMap<String, String>>,
+ /// long ID of the current table snapshot; must be the same as the current
+ /// ID of the main branch in refs.
+ current_snapshot_id: Option<i64>,
+ ///A list of valid snapshots. Valid snapshots are snapshots for which all
+ /// data files exist in the file system. A data file must not be deleted
+ /// from the file system until the last snapshot in which it was listed is
+ /// garbage collected.
+ snapshots: Option<HashMap<i64, Snapshot>>,
+ /// A list (optional) of timestamp and snapshot ID pairs that encodes
changes
+ /// to the current snapshot for the table. Each time the
current-snapshot-id
+ /// is changed, a new entry should be added with the last-updated-ms
+ /// and the new current-snapshot-id. When snapshots are expired from
+ /// the list of valid snapshots, all entries before a snapshot that has
+ /// expired should be removed.
+ snapshot_log: Option<Vec<SnapshotLog>>,
+
+ /// A list (optional) of timestamp and metadata file location pairs
+ /// that encodes changes to the previous metadata files for the table.
+ /// Each time a new metadata file is created, a new entry of the
+ /// previous metadata file location should be added to the list.
+ /// Tables can be configured to remove oldest metadata log entries and
+ /// keep a fixed-size log of the most recent entries after a commit.
+ metadata_log: Option<Vec<MetadataLog>>,
+
+ /// A list of sort orders, stored as full sort order objects.
+ sort_orders: HashMap<i64, SortOrder>,
+ /// Default sort order id of the table. Note that this could be used by
+ /// writers, but is not used when reading because reads use the specs
+ /// stored in manifest files.
+ default_sort_order_id: i64,
+ ///A map of snapshot references. The map keys are the unique snapshot
reference
+ /// names in the table, and the map values are snapshot reference objects.
+ /// There is always a main branch reference pointing to the
current-snapshot-id
+ /// even if the refs map is null.
+ refs: HashMap<String, Reference>,
+}
+
+impl TableMetadata {
+ /// Get current schema
+ #[inline]
+ pub fn current_schema(&self) -> Result<&Schema, Error> {
+ self.schemas.get(&self.current_schema_id).ok_or_else(|| {
+ Error::new(
+ ErrorKind::DataInvalid,
+ format!("Schema id {} not found!", self.current_schema_id),
+ )
+ })
+ }
+ /// Get default partition spec
+ #[inline]
+ pub fn default_partition_spec(&self) -> Result<&PartitionSpec, Error> {
+ self.partition_specs
+ .get(&self.default_spec_id)
+ .ok_or_else(|| {
+ Error::new(
+ ErrorKind::DataInvalid,
+ format!("Partition spec id {} not found!",
self.default_spec_id),
+ )
+ })
+ }
+
+ /// Get current snapshot
+ #[inline]
+ pub fn current_snapshot(&self) -> Result<&Snapshot, Error> {
+ let snapshot_id = self.current_snapshot_id.ok_or_else(|| {
+ Error::new(
+ ErrorKind::DataInvalid,
+ "Table snapshots are missing!".to_string(),
+ )
+ })?;
+ self.snapshots
+ .as_ref()
+ .ok_or_else(|| {
+ Error::new(
+ ErrorKind::DataInvalid,
+ "Table snapshots are missing!".to_string(),
+ )
+ })?
+ .get(&snapshot_id)
+ .ok_or_else(|| {
+ Error::new(
+ ErrorKind::DataInvalid,
+ format!("Partition spec id {} not found!", snapshot_id),
+ )
+ })
+ }
+
+ /// Append snapshot to table
+ pub fn append_snapshot(&mut self, snapshot: Snapshot) -> Result<(), Error>
{
+ self.last_updated_ms = snapshot.timestamp();
+ self.last_sequence_number = snapshot.sequence_number();
+
+ self.refs
+ .entry(MAIN_BRANCH.to_string())
+ .and_modify(|s| {
+ s.snapshot_id = snapshot.snapshot_id();
+ s.retention = Retention::Branch {
+ min_snapshots_to_keep: None,
Review Comment:
Shouldn't we keep the current branch's retention policy here instead of overwriting it?
##########
crates/iceberg/src/spec/mod.rs:
##########
@@ -17,7 +17,20 @@
//! Spec for Iceberg.
-pub mod datatypes;
-pub mod schema;
-pub mod transform;
-pub mod values;
+mod datatypes;
+mod partition;
+mod schema;
+mod snapshot;
+mod sort;
+mod table_metadata;
+mod transform;
+mod values;
+
+pub use datatypes::{ListType, MapType, NestedField, PrimitiveType, StructType,
Type};
+pub use partition::{PartitionField, PartitionSpec};
+pub use schema::{visit_schema, visit_struct, visit_type, Schema,
SchemaVisitor};
+pub use snapshot::{Operation, Reference, Snapshot, Summary};
+pub use sort::{SortField, SortOrder};
+pub use table_metadata::TableMetadata;
+pub use transform::Transform;
+pub use values::{Literal, PrimitiveLiteral, Struct};
Review Comment:
How about simply making it `pub use schema::*; pub use partition::*;` (and likewise for the other modules)?
##########
crates/iceberg/src/spec/table_metadata.rs:
##########
@@ -0,0 +1,859 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*!
+Defines the [table metadata](https://iceberg.apache.org/spec/#table-metadata).
+The main struct here is [TableMetadataV2] which defines the data for a table.
+*/
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+use serde_repr::{Deserialize_repr, Serialize_repr};
+use uuid::Uuid;
+
+use crate::{Error, ErrorKind};
+
+use super::{
+ partition::{PartitionField, PartitionSpec},
+ schema::{self, Schema},
+ snapshot::{Reference, Retention, Snapshot, SnapshotV1, SnapshotV2},
+ sort::SortOrder,
+};
+
+static MAIN_BRANCH: &str = "main";
+
+#[derive(Debug, PartialEq, Serialize, Deserialize, Eq, Clone)]
+#[serde(try_from = "TableMetadataEnum", into = "TableMetadataEnum")]
+/// Fields for the version 2 of the table metadata.
+pub struct TableMetadata {
+ /// Integer Version for the format.
+ format_version: FormatVersion,
+ /// A UUID that identifies the table
+ table_uuid: Uuid,
+ /// Location tables base location
+ location: String,
+ /// The tables highest sequence number
+ last_sequence_number: i64,
+ /// Timestamp in milliseconds from the unix epoch when the table was last
updated.
+ last_updated_ms: i64,
+ /// An integer; the highest assigned column ID for the table.
+ last_column_id: i32,
+ /// A list of schemas, stored as objects with schema-id.
+ schemas: HashMap<i32, Schema>,
+ /// ID of the table’s current schema.
+ current_schema_id: i32,
+ /// A list of partition specs, stored as full partition spec objects.
+ partition_specs: HashMap<i32, PartitionSpec>,
+ /// ID of the “current” spec that writers should use by default.
+ default_spec_id: i32,
+ /// An integer; the highest assigned partition field ID across all
partition specs for the table.
+ last_partition_id: i32,
+ ///A string to string map of table properties. This is used to control
settings that
+ /// affect reading and writing and is not intended to be used for
arbitrary metadata.
+ /// For example, commit.retry.num-retries is used to control the number of
commit retries.
+ properties: Option<HashMap<String, String>>,
Review Comment:
I think a plain `HashMap<String, String>` would be enough. We don't need `None` to
represent empty properties.
##########
crates/iceberg/src/spec/table_metadata.rs:
##########
@@ -0,0 +1,859 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*!
+Defines the [table metadata](https://iceberg.apache.org/spec/#table-metadata).
+The main struct here is [TableMetadataV2] which defines the data for a table.
+*/
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+use serde_repr::{Deserialize_repr, Serialize_repr};
+use uuid::Uuid;
+
+use crate::{Error, ErrorKind};
+
+use super::{
+ partition::{PartitionField, PartitionSpec},
+ schema::{self, Schema},
+ snapshot::{Reference, Retention, Snapshot, SnapshotV1, SnapshotV2},
+ sort::SortOrder,
+};
+
+static MAIN_BRANCH: &str = "main";
+
+#[derive(Debug, PartialEq, Serialize, Deserialize, Eq, Clone)]
+#[serde(try_from = "TableMetadataEnum", into = "TableMetadataEnum")]
+/// Fields for the version 2 of the table metadata.
+pub struct TableMetadata {
+ /// Integer Version for the format.
+ format_version: FormatVersion,
+ /// A UUID that identifies the table
+ table_uuid: Uuid,
+ /// Location tables base location
+ location: String,
+ /// The tables highest sequence number
+ last_sequence_number: i64,
+ /// Timestamp in milliseconds from the unix epoch when the table was last
updated.
+ last_updated_ms: i64,
+ /// An integer; the highest assigned column ID for the table.
+ last_column_id: i32,
+ /// A list of schemas, stored as objects with schema-id.
+ schemas: HashMap<i32, Schema>,
+ /// ID of the table’s current schema.
+ current_schema_id: i32,
+ /// A list of partition specs, stored as full partition spec objects.
+ partition_specs: HashMap<i32, PartitionSpec>,
+ /// ID of the “current” spec that writers should use by default.
+ default_spec_id: i32,
+ /// An integer; the highest assigned partition field ID across all
partition specs for the table.
+ last_partition_id: i32,
+ ///A string to string map of table properties. This is used to control
settings that
+ /// affect reading and writing and is not intended to be used for
arbitrary metadata.
+ /// For example, commit.retry.num-retries is used to control the number of
commit retries.
+ properties: Option<HashMap<String, String>>,
+ /// long ID of the current table snapshot; must be the same as the current
+ /// ID of the main branch in refs.
+ current_snapshot_id: Option<i64>,
+ ///A list of valid snapshots. Valid snapshots are snapshots for which all
+ /// data files exist in the file system. A data file must not be deleted
+ /// from the file system until the last snapshot in which it was listed is
+ /// garbage collected.
+ snapshots: Option<HashMap<i64, Snapshot>>,
Review Comment:
For large data structures like `Snapshot` and `Schema`, I would suggest storing
them behind pointers, e.g. `Arc<Snapshot>`, to avoid unnecessary cloning when
sharing them or sending them to other threads.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]