Fokko commented on code in PR #29: URL: https://github.com/apache/iceberg-rust/pull/29#discussion_r1297010076
########## crates/iceberg/src/spec/snapshot.rs: ########## @@ -0,0 +1,343 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/*! + * Snapshots +*/ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use super::table_metadata::SnapshotLog; + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "lowercase")] +/// The operation field is used by some operations, like snapshot expiration, to skip processing certain snapshots. +pub enum Operation { + /// Only data files were added and no files were removed. + Append, + /// Data and delete files were added and removed without changing table data; + /// i.e., compaction, changing the data file format, or relocating data files. + Replace, + /// Data and delete files were added and removed in a logical overwrite operation. + Overwrite, + /// Data files were removed and their contents logically deleted and/or delete files were added to delete rows. + Delete, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +/// Summarises the changes in the snapshot. +pub struct Summary { + /// The type of operation in the snapshot + pub operation: Operation, + /// Other summary data. 
+ #[serde(flatten)] + pub other: HashMap<String, String>, +} + +impl Default for Operation { + fn default() -> Operation { + Self::Append + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Builder)] +#[builder(setter(prefix = "with"))] +/// A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table. +pub struct Snapshot { + /// A unique long ID + snapshot_id: i64, + /// The snapshot ID of the snapshot’s parent. + /// Omitted for any snapshot with no parent + #[builder(default = "None")] + parent_snapshot_id: Option<i64>, + /// A monotonically increasing long that tracks the order of + /// changes to a table. + sequence_number: i64, + /// A timestamp when the snapshot was created, used for garbage + /// collection and table inspection + timestamp_ms: i64, + /// The location of a manifest list for this snapshot that + /// tracks manifest files with additional metadata. + manifest_list: ManifestList, + /// A string map that summarizes the snapshot changes, including operation. + summary: Summary, + /// ID of the table’s current schema when the snapshot was created. + #[builder(setter(strip_option))] + schema_id: Option<i64>, +} + +/// Type to distinguish between a path to a manifestlist file or a vector of manifestfile locations +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(untagged)] +pub enum ManifestList { + /// Location of manifestlist file + ManifestListFile(String), + /// Manifestfile locations + ManifestFiles(Vec<String>), +} + +impl Snapshot { + /// Get the id of the snapshot + #[inline] + pub fn snapshot_id(&self) -> i64 { + self.snapshot_id + } + /// Get sequence_number of the snapshot. Is 0 for Iceberg V1 tables. 
+ #[inline] + pub fn sequence_number(&self) -> i64 { + self.sequence_number + } + /// Get location of manifest_list file + #[inline] + pub fn manifest_list(&self) -> &ManifestList { + &self.manifest_list + } + /// Get summary of the snapshot + #[inline] + pub fn summary(&self) -> &Summary { + &self.summary + } + /// Get the timestamp of when the snapshot was created + #[inline] + pub fn timestamp(&self) -> i64 { + self.timestamp_ms + } + /// Create snapshot builder + pub fn builder() -> SnapshotBuilder { + SnapshotBuilder::default() + } + + pub(crate) fn log(&self) -> SnapshotLog { + SnapshotLog { + timestamp_ms: self.timestamp_ms, + snapshot_id: self.snapshot_id, + } + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "kebab-case")] +/// A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table. +pub(crate) struct SnapshotV2 {

Review Comment:
Again, I would combine the V1 and V2. The `sequence_number` is added later on, and there is some logic to set it afterward:

```python
def _inherit_sequence_number(entry: ManifestEntry, manifest: ManifestFile) -> ManifestEntry:
    """Inherits the sequence numbers.

    More information in the spec: https://iceberg.apache.org/spec/#sequence-number-inheritance

    Args:
        entry: The manifest entry that has null sequence numbers.
        manifest: The manifest that has a sequence number.

    Returns:
        The manifest entry with the sequence numbers set.
    """
    # The snapshot_id is required in V1, inherit with V2 when null
    if entry.snapshot_id is None:
        entry.snapshot_id = manifest.added_snapshot_id

    # in v1 tables, the data sequence number is not persisted and can be safely defaulted to 0
    # in v2 tables, the data sequence number should be inherited iff the entry status is ADDED
    if entry.data_sequence_number is None and (manifest.sequence_number == 0 or entry.status == ManifestEntryStatus.ADDED):
        entry.data_sequence_number = manifest.sequence_number

    # in v1 tables, the file sequence number is not persisted and can be safely defaulted to 0
    # in v2 tables, the file sequence number should be inherited iff the entry status is ADDED
    if entry.file_sequence_number is None and (manifest.sequence_number == 0 or entry.status == ManifestEntryStatus.ADDED):
        # Only available in V2, always 0 in V1
        entry.file_sequence_number = manifest.sequence_number

    return entry
```

This can happen when deserializing the JSON, or later on (like we do in PyIceberg).

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
