kevinjqliu commented on code in PR #2143: URL: https://github.com/apache/iceberg-python/pull/2143#discussion_r2265420806
########## pyiceberg/table/update/snapshot.py: ########## @@ -920,87 +919,49 @@ class ExpireSnapshots(UpdateTableMetadata["ExpireSnapshots"]): _requirements: Tuple[TableRequirement, ...] = () def _commit(self) -> UpdatesAndRequirements: - """ - Commit the staged updates and requirements. + """Commit the staged updates and requirements. - This will remove the snapshots with the given IDs, but will always skip protected snapshots (branch/tag heads). + This will remove the snapshots with the given IDs. Returns: Tuple of updates and requirements to be committed, as required by the calling parent apply functions. """ - # Remove any protected snapshot IDs from the set to expire, just in case - protected_ids = self._get_protected_snapshot_ids() - self._snapshot_ids_to_expire -= protected_ids update = RemoveSnapshotsUpdate(snapshot_ids=self._snapshot_ids_to_expire) - self._updates += (update,) - return self._updates, self._requirements Review Comment: i think we should keep the original implementation for this function ########## mkdocs/docs/api.md: ########## @@ -1287,6 +1334,67 @@ with table.manage_snapshots() as ms: ms.create_branch(snapshot_id1, "Branch_A").create_tag(snapshot_id2, "tag789") ``` +## Table Maintenance Review Comment: this section is great. lets keep this. can we revert all the other unrelated changes? ########## pyiceberg/table/inspect.py: ########## Review Comment: ``` tests/integration/test_inspect_table.py:984: error: "InspectTable" has no attribute "all_files"; maybe "_files"? [attr-defined] ``` is due to the changes in this PR, because the `_all_files` function is removed. you can run `git checkout main pyiceberg/table/inspect.py` to put it back and then commit ########## pyiceberg/table/update/snapshot.py: ########## @@ -920,87 +919,49 @@ class ExpireSnapshots(UpdateTableMetadata["ExpireSnapshots"]): _requirements: Tuple[TableRequirement, ...] = () def _commit(self) -> UpdatesAndRequirements: - """ - Commit the staged updates and requirements. + """Commit the staged updates and requirements. - This will remove the snapshots with the given IDs, but will always skip protected snapshots (branch/tag heads). + This will remove the snapshots with the given IDs. Returns: Tuple of updates and requirements to be committed, as required by the calling parent apply functions. """ - # Remove any protected snapshot IDs from the set to expire, just in case - protected_ids = self._get_protected_snapshot_ids() - self._snapshot_ids_to_expire -= protected_ids update = RemoveSnapshotsUpdate(snapshot_ids=self._snapshot_ids_to_expire) - self._updates += (update,) - return self._updates, self._requirements + return (update,), () def _get_protected_snapshot_ids(self) -> Set[int]: - """ - Get the IDs of protected snapshots. + """Get the IDs of protected snapshots. - These are the HEAD snapshots of all branches and all tagged snapshots. These ids are to be excluded from expiration. + These are the HEAD snapshots of all branches and all tagged snapshots. + These ids are to be excluded from expiration. Returns: Set of protected snapshot IDs to exclude from expiration. """ - protected_ids: Set[int] = set() + protected_ids = set() for ref in self._transaction.table_metadata.refs.values(): if ref.snapshot_ref_type in [SnapshotRefType.TAG, SnapshotRefType.BRANCH]: protected_ids.add(ref.snapshot_id) return protected_ids - def expire_snapshot_by_id(self, snapshot_id: int) -> ExpireSnapshots: - """ - Expire a snapshot by its ID. - - This will mark the snapshot for expiration. + def by_id(self, snapshot_id: int) -> ExpireSnapshots: + """Expire a snapshot by its ID. Args: snapshot_id (int): The ID of the snapshot to expire. + Returns: This for method chaining. """ if self._transaction.table_metadata.snapshot_by_id(snapshot_id) is None: raise ValueError(f"Snapshot with ID {snapshot_id} does not exist.") - if snapshot_id in self._get_protected_snapshot_ids(): - raise ValueError(f"Snapshot with ID {snapshot_id} is protected and cannot be expired.") + protected_ids = self._get_protected_snapshot_ids() + if snapshot_id in protected_ids: + raise ValueError(f"Cannot expire snapshot {snapshot_id} as it is referenced by a branch or tag.") self._snapshot_ids_to_expire.add(snapshot_id) - - return self - - def expire_snapshots_by_ids(self, snapshot_ids: List[int]) -> "ExpireSnapshots": - """ - Expire multiple snapshots by their IDs. - - This will mark the snapshots for expiration. - - Args: - snapshot_ids (List[int]): List of snapshot IDs to expire. - Returns: - This for method chaining. - """ - for snapshot_id in snapshot_ids: - self.expire_snapshot_by_id(snapshot_id) - return self - - def expire_snapshots_older_than(self, timestamp_ms: int) -> "ExpireSnapshots": Review Comment: maybe keep this as rename as `by_ids` and `older_than`? ########## tests/table/test_maintenance_api.py: ########## Review Comment: this test file isnt really testing the expire snapshots functionality. wdyt about removing it and reusing the previous `tests/table/test_expire_snapshots.py` file? we can just change any reference to `.expire_snapshots()` to `maintenance.expire_snapshots()` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org