This is an automated email from the ASF dual-hosted git repository.
honahx pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 8cdf4abd 🐛 Write fields instead of spec object (#846)
8cdf4abd is described below
commit 8cdf4abdc5e4779ff888c62041f027bb3309d4c5
Author: Fokko Driesprong <[email protected]>
AuthorDate: Mon Jun 24 08:47:35 2024 +0200
🐛 Write fields instead of spec object (#846)
---
pyiceberg/manifest.py | 46 ++++++++++++++++++++------------------------
tests/utils/test_manifest.py | 4 ++--
2 files changed, 23 insertions(+), 27 deletions(-)
diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py
index defe5958..bf5749ce 100644
--- a/pyiceberg/manifest.py
+++ b/pyiceberg/manifest.py
@@ -31,13 +31,15 @@ from typing import (
Type,
)
+from pydantic_core import to_json
+
from pyiceberg.avro.file import AvroFile, AvroOutputFile
from pyiceberg.conversions import to_bytes
from pyiceberg.exceptions import ValidationError
from pyiceberg.io import FileIO, InputFile, OutputFile
from pyiceberg.partitioning import PartitionSpec
from pyiceberg.schema import Schema
-from pyiceberg.typedef import EMPTY_DICT, Record, TableVersion
+from pyiceberg.typedef import Record, TableVersion
from pyiceberg.types import (
BinaryType,
BooleanType,
@@ -645,7 +647,6 @@ class ManifestWriter(ABC):
_output_file: OutputFile
_writer: AvroOutputFile[ManifestEntry]
_snapshot_id: int
- _meta: Dict[str, str]
_added_files: int
_added_rows: int
_existing_files: int
@@ -655,15 +656,12 @@ class ManifestWriter(ABC):
_min_data_sequence_number: Optional[int]
_partitions: List[Record]
- def __init__(
- self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int, meta: Dict[str, str] = EMPTY_DICT
- ) -> None:
+ def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int) -> None:
self.closed = False
self._spec = spec
self._schema = schema
self._output_file = output_file
self._snapshot_id = snapshot_id
- self._meta = meta
self._added_files = 0
self._added_rows = 0
@@ -697,6 +695,15 @@ class ManifestWriter(ABC):
@abstractmethod
def version(self) -> TableVersion: ...
+ @property
+ def _meta(self) -> Dict[str, str]:
+ return {
+ "schema": self._schema.model_dump_json(),
+ "partition-spec": to_json(self._spec.fields).decode("utf-8"),
+ "partition-spec-id": str(self._spec.spec_id),
+ "format-version": str(self.version),
+ }
+
def _with_partition(self, format_version: TableVersion) -> Schema:
data_file_type = data_file_with_partition(
format_version=format_version,
partition_type=self._spec.partition_type(self._schema)
@@ -771,12 +778,6 @@ class ManifestWriterV1(ManifestWriter):
schema,
output_file,
snapshot_id,
- {
- "schema": schema.model_dump_json(),
- "partition-spec": spec.model_dump_json(),
- "partition-spec-id": str(spec.spec_id),
- "format-version": "1",
- },
)
def content(self) -> ManifestContent:
@@ -792,19 +793,7 @@ class ManifestWriterV1(ManifestWriter):
class ManifestWriterV2(ManifestWriter):
def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int):
- super().__init__(
- spec,
- schema,
- output_file,
- snapshot_id,
- meta={
- "schema": schema.model_dump_json(),
- "partition-spec": spec.model_dump_json(),
- "partition-spec-id": str(spec.spec_id),
- "format-version": "2",
- "content": "data",
- },
- )
+ super().__init__(spec, schema, output_file, snapshot_id)
def content(self) -> ManifestContent:
return ManifestContent.DATA
@@ -813,6 +802,13 @@ class ManifestWriterV2(ManifestWriter):
def version(self) -> TableVersion:
return 2
+ @property
+ def _meta(self) -> Dict[str, str]:
+ return {
+ **super()._meta,
+ "content": "data",
+ }
+
def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry:
if entry.data_sequence_number is None:
if entry.snapshot_id is not None and entry.snapshot_id != self._snapshot_id:
diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py
index 8bb03cd8..a812b384 100644
--- a/tests/utils/test_manifest.py
+++ b/tests/utils/test_manifest.py
@@ -348,8 +348,8 @@ def test_write_manifest(
expected_metadata = {
"schema": test_schema.model_dump_json(),
- "partition-spec": test_spec.model_dump_json(),
- "partition-spec-id": str(test_spec.spec_id),
+ "partition-spec": """[{"source-id":1,"field-id":1,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":2,"transform":"identity","name":"tpep_pickup_datetime"}]""",
+ "partition-spec-id": str(demo_manifest_file.partition_spec_id),
"format-version": str(format_version),
}
_verify_metadata_with_fastavro(