rdblue commented on a change in pull request #3677:
URL: https://github.com/apache/iceberg/pull/3677#discussion_r763480075



##########
File path: python/src/iceberg/table/metadata.py
##########
@@ -0,0 +1,215 @@
+import codecs
+import json
+from typing import Any, Optional, Union
+
+import boto3
+from jsonschema import validate as validate_json
+from jsonschema.exceptions import ValidationError
+
+from iceberg.io.s3 import S3Url
+
+TABLE_METADATA_V1_SCHEMA = {
+    "$schema": "http://json-schema.org/draft-04/schema#",
+    "type": "object",
+    "properties": {
+        "format-version": {"type": "string"},
+        "table-uuid": {"type": "string"},
+        "location": {"type": "string"},
+        "last-sequence-number": {"type": "integer"},
+        "last-updated-ms": {"type": "integer"},
+        "last-column-id": {"type": "integer"},
+        "schemas": {"type": "array", "items": {}},
+        "current-schema-id": {"type": "integer"},
+        "partition-spec": {"type": "array", "items": {}},
+        "partition-specs": {"type": "array", "items": {}},
+        "default-spec-id": {"type": "integer"},
+        "last-partition-id": {"type": "integer"},
+        "properties": {"type": "object"},
+        "current-snapshot-id": {"type": "number"},
+        "snapshots": {"type": "array", "items": {}},
+        "snapshot-log": {"type": "array", "items": {}},
+        "metadata-log": {"type": "array", "items": {}},
+        "sort-orders": {"type": "array", "items": {}},
+        "default-sort-order-id": {"type": "integer"},
+    },
+    "required": [
+        "format-version",
+        "table-uuid",
+        "location",
+        "last-sequence-number",
+        "last-updated-ms",
+        "last-column-id",
+        "schemas",
+        "current-schema-id",
+        "partition-spec",
+        "partition-specs",
+        "default-spec-id",
+        "last-partition-id",
+        "properties",
+        "current-snapshot-id",
+        "snapshots",
+        "snapshot-log",
+        "metadata-log",
+        "sort-orders",
+        "default-sort-order-id",
+    ],
+}
+TABLE_METADATA_V2_SCHEMA = {
+    "$schema": "http://json-schema.org/draft-04/schema#",
+    "type": "object",
+    "properties": {
+        "format-version": {"type": "string"},
+        "table-uuid": {"type": "string"},
+        "location": {"type": "string"},
+        "last-sequence-number": {"type": "integer"},
+        "last-updated-ms": {"type": "integer"},
+        "last-column-id": {"type": "integer"},
+        "schemas": {"type": "array", "items": {}},
+        "current-schema-id": {"type": "integer"},
+        "partition-spec": {"type": "array", "items": {}},
+        "partition-specs": {"type": "array", "items": {}},
+        "default-spec-id": {"type": "integer"},
+        "last-partition-id": {"type": "integer"},
+        "properties": {"type": "object"},
+        "current-snapshot-id": {"type": "number"},
+        "snapshots": {"type": "array", "items": {}},
+        "snapshot-log": {"type": "array", "items": {}},
+        "metadata-log": {"type": "array", "items": {}},
+        "sort-orders": {"type": "array", "items": {}},
+        "default-sort-order-id": {"type": "integer"},
+    },
+    "required": [
+        "format-version",
+        "table-uuid",
+        "location",
+        "last-sequence-number",
+        "last-updated-ms",
+        "last-column-id",
+        "schemas",
+        "current-schema-id",
+        "partition-spec",
+        "partition-specs",
+        "default-spec-id",
+        "last-partition-id",
+        "properties",
+        "current-snapshot-id",
+        "snapshots",
+        "snapshot-log",
+        "metadata-log",
+        "sort-orders",
+        "default-sort-order-id",
+    ],
+}
+
+
+class TableMetadata:
+    """Metadata for an Iceberg table as specified in the Apache Iceberg
+    spec (https://iceberg.apache.org/spec/#iceberg-table-spec)
+
+    Args:
+      metadata (dict): Table metadata dictionary representation
+      version (str|int): The metadata spec version (1 or 2)
+    """
+
+    def __init__(self, metadata: dict, version: Union[str, int]):
+        self._version = version
+        self._metadata = metadata
+        for name, value in metadata.items():
+            setattr(self, self._clean_attribute_name(name), self._wrap(value))
+
+    def _wrap(self, value: Any):
+        """A recursive function that drills into iterable values and returns
+        nested TableMetadata instances

Review comment:
       Okay, I think I get it.
   
   I like being able to use `m.snapshot_log[0]` and similar ways to access the 
metadata. But, I'm not sure that this is changing enough that it's worth the 
wrapper approach, instead of just converting to a `TableMetadata` class that 
pulls out and stores `self.current_schema_id` (for example). Table metadata 
shouldn't be _that_ complicated since it's mostly a few lists of objects at the 
most nested level (like metadata > snapshots > snapshot > properties > 
key/value).
   
   While this makes it easy to get started, it would be awkward to make updates 
to the metadata as JSON because you'd need to produce a new JSON tree and then 
wrap with this class. We may also want classes for things like Snapshot, which 
can embed some operations like reading metadata.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to