rdblue commented on code in PR #6525:
URL: https://github.com/apache/iceberg/pull/6525#discussion_r1064184537
##########
python/pyiceberg/manifest.py:
##########
@@ -97,48 +178,82 @@ class DataFile(IcebergBaseModel):
sort_order_id: Optional[int] = Field()
-class ManifestEntry(IcebergBaseModel):
+MANIFEST_ENTRY_SCHEMA = Schema(
+ NestedField(0, "status", IntegerType(), required=True),
+ NestedField(1, "snapshot_id", LongType(), required=False),
+ NestedField(3, "sequence_number", LongType(), required=False),
+ NestedField(4, "file_sequence_number", LongType(), required=False),
+ NestedField(2, "data_file", DATA_FILE, required=False),
+)
+
+
+class ManifestEntry(PydanticStruct):
status: ManifestEntryStatus = Field()
snapshot_id: Optional[int] = Field()
sequence_number: Optional[int] = Field()
+ file_sequence_number: Optional[int] = Field()
data_file: DataFile = Field()
-class PartitionFieldSummary(IcebergBaseModel):
+PARTITION_FIELD_SUMMARY_TYPE = StructType(
+ NestedField(509, "contains_null", BooleanType(), required=True),
+ NestedField(518, "contains_nan", BooleanType(), required=False),
+ NestedField(510, "lower_bound", BinaryType(), required=False),
+ NestedField(511, "upper_bound", BinaryType(), required=False),
+)
+
+
+class PartitionFieldSummary(PydanticStruct):
contains_null: bool = Field()
contains_nan: Optional[bool] = Field()
lower_bound: Optional[bytes] = Field()
upper_bound: Optional[bytes] = Field()
-class ManifestFile(IcebergBaseModel):
+MANIFEST_FILE_SCHEMA: Schema = Schema(
+ NestedField(500, "manifest_path", StringType(), required=True,
doc="Location URI with FS scheme"),
+ NestedField(501, "manifest_length", LongType(), required=True),
+ NestedField(502, "partition_spec_id", IntegerType(), required=True),
+ NestedField(517, "content", IntegerType(), required=False),
+ NestedField(515, "sequence_number", LongType(), required=False),
+ NestedField(516, "min_sequence_number", LongType(), required=False),
+ NestedField(503, "added_snapshot_id", LongType(), required=False),
+ NestedField(504, "added_files_count", IntegerType(), required=False),
+ NestedField(505, "existing_files_count", IntegerType(), required=False),
+ NestedField(506, "deleted_files_count", IntegerType(), required=False),
+ NestedField(512, "added_rows_count", LongType(), required=False),
+ NestedField(513, "existing_rows_count", LongType(), required=False),
+ NestedField(514, "deleted_rows_count", LongType(), required=False),
+ NestedField(507, "partitions", ListType(508, PARTITION_FIELD_SUMMARY_TYPE,
element_required=True), required=False),
+ NestedField(519, "key_metadata", BinaryType(), required=False),
+)
+
+
+class ManifestFile(PydanticStruct):
manifest_path: str = Field()
manifest_length: int = Field()
partition_spec_id: int = Field()
content: ManifestContent = Field(default=ManifestContent.DATA)
- sequence_number: int = Field(default=0)
- min_sequence_number: int = Field(default=0)
+ sequence_number: Optional[int] = Field()
+ min_sequence_number: Optional[int] = Field()
added_snapshot_id: Optional[int] = Field()
- added_data_files_count: Optional[int] = Field()
- existing_data_files_count: Optional[int] = Field()
- deleted_data_files_count: Optional[int] = Field()
+ added_files_count: Optional[int] = Field()
+ existing_files_count: Optional[int] = Field()
+ deleted_files_count: Optional[int] = Field()
added_rows_count: Optional[int] = Field()
- existing_rows_counts: Optional[int] = Field()
+ existing_rows_count: Optional[int] = Field()
deleted_rows_count: Optional[int] = Field()
- partitions: Optional[List[PartitionFieldSummary]] = Field()
+ partitions: List[PartitionFieldSummary] = Field(default_factory=list)
Review Comment:
We probably want to use `None` as the default. If this is defined, it should
have a `PartitionFieldSummary` for every partition field in the partition spec
associated with the manifest file (that is, identified by `partition_spec_id`).
If we have a non-null list then we could introduce cases where we blindly look
up an index when this wasn't present.
I guess this is mitigated because `if partitions` will return `False` for an
empty list, but it seems like using `None` is more clear.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]