This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new cdc3e54a Disallow writing empty Manifest files (#876)
cdc3e54a is described below
commit cdc3e54aad23638a3411fb4a771a2901e5bf8a93
Author: Fokko Driesprong <[email protected]>
AuthorDate: Tue Jul 9 08:28:27 2024 +0200
Disallow writing empty Manifest files (#876)
* Disallow writing empty Avro files/blocks
Raising an exception when doing this might look extreme, but
there is no real good reason to allow this.
* Relax the constaints a bit
---
pyiceberg/manifest.py | 6 ++++++
tests/utils/test_manifest.py | 19 ++++++++++++++++++-
2 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py
index bf5749ce..4fd82fec 100644
--- a/pyiceberg/manifest.py
+++ b/pyiceberg/manifest.py
@@ -685,6 +685,10 @@ class ManifestWriter(ABC):
traceback: Optional[TracebackType],
) -> None:
"""Close the writer."""
+ if (self._added_files + self._existing_files + self._deleted_files) ==
0:
+ # This is just a guard to ensure that we don't write empty
manifest files
+ raise ValueError("An empty manifest file has been written")
+
self.closed = True
self._writer.__exit__(exc_type, exc_value, traceback)
@@ -757,6 +761,8 @@ class ManifestWriter(ABC):
elif entry.status == ManifestEntryStatus.DELETED:
self._deleted_files += 1
self._deleted_rows += entry.data_file.record_count
+ else:
+ raise ValueError(f"Unknown entry: {entry.status}")
self._partitions.append(entry.data_file.partition)
diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py
index a812b384..ecb99e28 100644
--- a/tests/utils/test_manifest.py
+++ b/tests/utils/test_manifest.py
@@ -35,7 +35,7 @@ from pyiceberg.manifest import (
write_manifest,
write_manifest_list,
)
-from pyiceberg.partitioning import PartitionField, PartitionSpec
+from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC,
PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table.snapshots import Operation, Snapshot, Summary
from pyiceberg.transforms import IdentityTransform
@@ -306,6 +306,23 @@ def test_read_manifest_v2(generated_manifest_file_file_v2:
str) -> None:
assert entry.status == ManifestEntryStatus.ADDED
+def test_write_empty_manifest() -> None:
+ io = load_file_io()
+ test_schema = Schema(NestedField(1, "foo", IntegerType(), False))
+ with TemporaryDirectory() as tmpdir:
+ tmp_avro_file = tmpdir + "/test_write_manifest.avro"
+
+ with pytest.raises(ValueError, match="An empty manifest file has been
written"):
+ with write_manifest(
+ format_version=1,
+ spec=UNPARTITIONED_PARTITION_SPEC,
+ schema=test_schema,
+ output_file=io.new_output(tmp_avro_file),
+ snapshot_id=8744736658442914487,
+ ) as _:
+ pass
+
+
@pytest.mark.parametrize("format_version", [1, 2])
def test_write_manifest(
generated_manifest_file_file_v1: str, generated_manifest_file_file_v2:
str, format_version: TableVersion