rdblue commented on a change in pull request #3677:
URL: https://github.com/apache/iceberg/pull/3677#discussion_r763316871



##########
File path: python/src/iceberg/table/metadata.py
##########
@@ -0,0 +1,215 @@
+import codecs
+import json
+from typing import Any, Optional, Union
+
+import boto3
+from jsonschema import validate as validate_json
+from jsonschema.exceptions import ValidationError
+
+from iceberg.io.s3 import S3Url
+
+TABLE_METADATA_V1_SCHEMA = {
+    "$schema": "http://json-schema.org/draft-04/schema#",
+    "type": "object",
+    "properties": {
+        "format-version": {"type": "string"},
+        "table-uuid": {"type": "string"},
+        "location": {"type": "string"},
+        "last-sequence-number": {"type": "integer"},
+        "last-updated-ms": {"type": "integer"},
+        "last-column-id": {"type": "integer"},
+        "schemas": {"type": "array", "items": {}},
+        "current-schema-id": {"type": "integer"},
+        "partition-spec": {"type": "array", "items": {}},
+        "partition-specs": {"type": "array", "items": {}},
+        "default-spec-id": {"type": "integer"},
+        "last-partition-id": {"type": "integer"},
+        "properties": {"type": "object"},
+        "current-snapshot-id": {"type": "number"},
+        "snapshots": {"type": "array", "items": {}},
+        "snapshot-log": {"type": "array", "items": {}},
+        "metadata-log": {"type": "array", "items": {}},
+        "sort-orders": {"type": "array", "items": {}},
+        "default-sort-order-id": {"type": "integer"},
+    },
+    "required": [
+        "format-version",
+        "table-uuid",
+        "location",
+        "last-sequence-number",
+        "last-updated-ms",
+        "last-column-id",
+        "schemas",
+        "current-schema-id",
+        "partition-spec",
+        "partition-specs",
+        "default-spec-id",
+        "last-partition-id",
+        "properties",
+        "current-snapshot-id",
+        "snapshots",
+        "snapshot-log",
+        "metadata-log",
+        "sort-orders",
+        "default-sort-order-id",
+    ],
+}
+TABLE_METADATA_V2_SCHEMA = {
+    "$schema": "http://json-schema.org/draft-04/schema#",
+    "type": "object",
+    "properties": {
+        "format-version": {"type": "string"},
+        "table-uuid": {"type": "string"},
+        "location": {"type": "string"},
+        "last-sequence-number": {"type": "integer"},
+        "last-updated-ms": {"type": "integer"},
+        "last-column-id": {"type": "integer"},
+        "schemas": {"type": "array", "items": {}},
+        "current-schema-id": {"type": "integer"},
+        "partition-spec": {"type": "array", "items": {}},
+        "partition-specs": {"type": "array", "items": {}},
+        "default-spec-id": {"type": "integer"},
+        "last-partition-id": {"type": "integer"},
+        "properties": {"type": "object"},
+        "current-snapshot-id": {"type": "number"},
+        "snapshots": {"type": "array", "items": {}},
+        "snapshot-log": {"type": "array", "items": {}},
+        "metadata-log": {"type": "array", "items": {}},
+        "sort-orders": {"type": "array", "items": {}},
+        "default-sort-order-id": {"type": "integer"},
+    },
+    "required": [
+        "format-version",
+        "table-uuid",
+        "location",
+        "last-sequence-number",
+        "last-updated-ms",
+        "last-column-id",
+        "schemas",
+        "current-schema-id",
+        "partition-spec",
+        "partition-specs",
+        "default-spec-id",
+        "last-partition-id",
+        "properties",
+        "current-snapshot-id",
+        "snapshots",
+        "snapshot-log",
+        "metadata-log",
+        "sort-orders",
+        "default-sort-order-id",
+    ],
+}
+
+
+class TableMetadata:
+    """Metadata for an Iceberg table as specified in the Apache Iceberg
+    spec (https://iceberg.apache.org/spec/#iceberg-table-spec)
+
+    Args:
+      metadata (dict): Table metadata dictionary representation
+      version (str|int): The metadata spec version (1 or 2)
+    """
+
+    def __init__(self, metadata: dict, version: Union[str, int]):
+        self._version = version
+        self._metadata = metadata
+        for name, value in metadata.items():
+            setattr(self, self._clean_attribute_name(name), self._wrap(value))
+
+    def _wrap(self, value: Any):
+        """A recursive function that drills into iterable values and returns
+        nested TableMetadata instances
+
+        Args:
+            value: A table metadata value.
+            - If it's a string, number, or boolean, the value is returned
+            - If it's an array, the values will be iterated through and processed and
+                added to an array that's returned
+            - If it's an object, another TableMetadata instance is returned
+
+        """
+
+        if isinstance(value, (tuple, list, set, frozenset)):
+            return type(value)([self._wrap(v) for v in value])
+        else:
+            return (
+                TableMetadata(value, version=self._version)
+                if (value and isinstance(value, dict))
+                else value
+            )
+
+    @staticmethod
+    def _clean_attribute_name(value):
+        """Fixes attribute names to be python friendly"""
+        return value.replace("-", "_").replace(".", "_")
+
+    def validate(self):

Review comment:
       I like how this can use JSON schema to validate, but what should we do 
if validation fails? Are there some things that we can recover from? For 
example, v1 metadata with `schemas` and `current-schema-id` but not `schema` is 
technically invalid. But we can still read the table just fine.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to