tusharchou commented on code in PR #3049:
URL: https://github.com/apache/iceberg-python/pull/3049#discussion_r2818036432


##########
pyiceberg/table/update/validate.py:
##########
@@ -235,3 +298,60 @@ def _validate_added_data_files(
     if any(conflicting_entries):
         conflicting_snapshots = {entry.snapshot_id for entry in 
conflicting_entries if entry.snapshot_id is not None}
         raise ValidationException(f"Added data files were found matching the 
filter for snapshots {conflicting_snapshots}!")
+
+
+def _validate_no_new_delete_files(
+    table: Table,
+    starting_snapshot: Snapshot,
+    data_filter: BooleanExpression | None,
+    partition_set: dict[int, set[Record]] | None,
+    parent_snapshot: Snapshot | None,
+) -> None:
+    """Validate no new delete files matching a filter have been added to the 
table since starting a snapshot.
+
+    Args:
+        table: Table to validate
+        starting_snapshot: Snapshot current at the start of the operation
+        data_filter: Expression used to find added data files
+        partition_set: Dictionary of partition spec to set of partition records
+        parent_snapshot: Ending snapshot on the branch being validated
+    """
+    deletes = _added_delete_files(table, starting_snapshot, data_filter, 
partition_set, parent_snapshot)
+
+    if deletes.is_empty():
+        return
+
+    conflicting_delete_paths = [file.file_path for file in 
deletes.referenced_delete_files()]
+    raise ValidationException(
+        f"Found new conflicting delete files that can apply to records 
matching {data_filter}: {conflicting_delete_paths}"
+    )
+
+
+def _validate_no_new_deletes_for_data_files(
+    table: Table,
+    starting_snapshot: Snapshot,
+    data_filter: BooleanExpression | None,
+    data_files: set[DataFile],
+    parent_snapshot: Snapshot | None,
+) -> None:
+    """Validate no new delete files must be applied for data files that have 
been added to the table since a starting snapshot.
+
+    Args:
+        table: Table to validate
+        starting_snapshot: Snapshot current at the start of the operation
+        data_filter: Expression used to find added data files
+        data_files: data files to validate have no new deletes
+        parent_snapshot: Ending snapshot on the branch being validated
+    """
+    # If there is no current state, or no files has been added
+    if parent_snapshot is None or table.format_version < 2:

Review Comment:
   I've verified this logic locally by mocking a concurrent conflict. The
`table.format_version < 2` guard correctly prevents unnecessary overhead for
V1 tables, and using the `DeleteFileIndex` ensures that we only block commits
when there is a real overlap in data files (avoiding "lazy" global locks).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to