samredai commented on a change in pull request #3677:
URL: https://github.com/apache/iceberg/pull/3677#discussion_r763408253
##########
File path: python/src/iceberg/table/metadata.py
##########
@@ -0,0 +1,215 @@
+import codecs
+import json
+from typing import Any, Optional, Union
+
+import boto3
+from jsonschema import validate as validate_json
+from jsonschema.exceptions import ValidationError
+
+from iceberg.io.s3 import S3Url
+
+TABLE_METADATA_V1_SCHEMA = {
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "type": "object",
+ "properties": {
+ "format-version": {"type": "string"},
+ "table-uuid": {"type": "string"},
+ "location": {"type": "string"},
+ "last-sequence-number": {"type": "integer"},
+ "last-updated-ms": {"type": "integer"},
+ "last-column-id": {"type": "integer"},
+ "schemas": {"type": "array", "items": {}},
+ "current-schema-id": {"type": "integer"},
+ "partition-spec": {"type": "array", "items": {}},
+ "partition-specs": {"type": "array", "items": {}},
+ "default-spec-id": {"type": "integer"},
+ "last-partition-id": {"type": "integer"},
+ "properties": {"type": "object"},
+ "current-snapshot-id": {"type": "number"},
+ "snapshots": {"type": "array", "items": {}},
+ "snapshot-log": {"type": "array", "items": {}},
+ "metadata-log": {"type": "array", "items": {}},
+ "sort-orders": {"type": "array", "items": {}},
+ "default-sort-order-id": {"type": "integer"},
+ },
+ "required": [
+ "format-version",
+ "table-uuid",
+ "location",
+ "last-sequence-number",
+ "last-updated-ms",
+ "last-column-id",
+ "schemas",
+ "current-schema-id",
+ "partition-spec",
+ "partition-specs",
+ "default-spec-id",
+ "last-partition-id",
+ "properties",
+ "current-snapshot-id",
+ "snapshots",
+ "snapshot-log",
+ "metadata-log",
+ "sort-orders",
+ "default-sort-order-id",
+ ],
+}
+TABLE_METADATA_V2_SCHEMA = {
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "type": "object",
+ "properties": {
+ "format-version": {"type": "string"},
+ "table-uuid": {"type": "string"},
+ "location": {"type": "string"},
+ "last-sequence-number": {"type": "integer"},
+ "last-updated-ms": {"type": "integer"},
+ "last-column-id": {"type": "integer"},
+ "schemas": {"type": "array", "items": {}},
+ "current-schema-id": {"type": "integer"},
+ "partition-spec": {"type": "array", "items": {}},
+ "partition-specs": {"type": "array", "items": {}},
+ "default-spec-id": {"type": "integer"},
+ "last-partition-id": {"type": "integer"},
+ "properties": {"type": "object"},
+ "current-snapshot-id": {"type": "number"},
+ "snapshots": {"type": "array", "items": {}},
+ "snapshot-log": {"type": "array", "items": {}},
+ "metadata-log": {"type": "array", "items": {}},
+ "sort-orders": {"type": "array", "items": {}},
+ "default-sort-order-id": {"type": "integer"},
+ },
+ "required": [
+ "format-version",
+ "table-uuid",
+ "location",
+ "last-sequence-number",
+ "last-updated-ms",
+ "last-column-id",
+ "schemas",
+ "current-schema-id",
+ "partition-spec",
+ "partition-specs",
+ "default-spec-id",
+ "last-partition-id",
+ "properties",
+ "current-snapshot-id",
+ "snapshots",
+ "snapshot-log",
+ "metadata-log",
+ "sort-orders",
+ "default-sort-order-id",
+ ],
+}
+
+
+class TableMetadata:
+ """Metadata for an Iceberg table as specified in the Apache Iceberg
+ spec (https://iceberg.apache.org/spec/#iceberg-table-spec)
+
+ Args:
+ metadata (dict): Table metadata dictionary representation
+ version (str|int): The metadata spec version (1 or 2)
+ """
+
+ def __init__(self, metadata: dict, version: Union[str, int]):
+ self._version = version
+ self._metadata = metadata
+ for name, value in metadata.items():
+ setattr(self, self._clean_attribute_name(name), self._wrap(value))
+
+ def _wrap(self, value: Any):
+ """A recursive function that drills into iterable values and returns
+ nested TableMetadata instances
Review comment:
The class name does make this sound odd, but the logic is essentially a
depth-first search (DFS) through the metadata that assigns everything as
instance attributes. If a value in the JSON is an object (like `properties`),
it is instantiated as a sort of "partial" `TableMetadata` instance, and a new
traversal then starts through the contents of that object. In other words:
```py
table_metadata = TableMetadata(...)
isinstance(table_metadata, TableMetadata) # True
isinstance(table_metadata.properties, TableMetadata) # True
isinstance(table_metadata.snapshot_log, list) # True
isinstance(table_metadata.snapshot_log[0], TableMetadata) # True
```
If the idea of this partial metadata existing as a `TableMetadata` instance
doesn't sit well, I could instead have a generic `Metadata` or `Config` class
that traverses the JSON object and creates an instance exposing all of its
contents as attributes, and then pass that as an argument to a `TableMetadata`
class that handles validation and other table-metadata-related concerns.
Something like:
```py
config = Config({"table-uuid": "foo", ...})
table_metadata = TableMetadata(config, version="2")
```
The `_wrap()` method would then live in the `Config` class, and its docstring
would read:
> A recursive function that drills into iterable values and returns
> nested Config instances
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]