samredai commented on a change in pull request #3677:
URL: https://github.com/apache/iceberg/pull/3677#discussion_r765236691



##########
File path: python/src/iceberg/table/metadata.py
##########
@@ -0,0 +1,215 @@
+import codecs
+import json
+from typing import Any, Optional, Union
+
+import boto3
+from jsonschema import validate as validate_json
+from jsonschema.exceptions import ValidationError
+
+from iceberg.io.s3 import S3Url
+
+TABLE_METADATA_V1_SCHEMA = {
+    "$schema": "http://json-schema.org/draft-04/schema#",
+    "type": "object",
+    "properties": {
+        "format-version": {"type": "string"},
+        "table-uuid": {"type": "string"},
+        "location": {"type": "string"},
+        "last-sequence-number": {"type": "integer"},
+        "last-updated-ms": {"type": "integer"},
+        "last-column-id": {"type": "integer"},
+        "schemas": {"type": "array", "items": {}},
+        "current-schema-id": {"type": "integer"},
+        "partition-spec": {"type": "array", "items": {}},
+        "partition-specs": {"type": "array", "items": {}},
+        "default-spec-id": {"type": "integer"},
+        "last-partition-id": {"type": "integer"},
+        "properties": {"type": "object"},
+        "current-snapshot-id": {"type": "number"},
+        "snapshots": {"type": "array", "items": {}},
+        "snapshot-log": {"type": "array", "items": {}},
+        "metadata-log": {"type": "array", "items": {}},
+        "sort-orders": {"type": "array", "items": {}},
+        "default-sort-order-id": {"type": "integer"},
+    },
+    "required": [
+        "format-version",
+        "table-uuid",
+        "location",
+        "last-sequence-number",
+        "last-updated-ms",
+        "last-column-id",
+        "schemas",
+        "current-schema-id",
+        "partition-spec",
+        "partition-specs",
+        "default-spec-id",
+        "last-partition-id",
+        "properties",
+        "current-snapshot-id",
+        "snapshots",
+        "snapshot-log",
+        "metadata-log",
+        "sort-orders",
+        "default-sort-order-id",
+    ],
+}
+TABLE_METADATA_V2_SCHEMA = {
+    "$schema": "http://json-schema.org/draft-04/schema#",
+    "type": "object",
+    "properties": {
+        "format-version": {"type": "string"},
+        "table-uuid": {"type": "string"},
+        "location": {"type": "string"},
+        "last-sequence-number": {"type": "integer"},
+        "last-updated-ms": {"type": "integer"},
+        "last-column-id": {"type": "integer"},
+        "schemas": {"type": "array", "items": {}},
+        "current-schema-id": {"type": "integer"},
+        "partition-spec": {"type": "array", "items": {}},
+        "partition-specs": {"type": "array", "items": {}},
+        "default-spec-id": {"type": "integer"},
+        "last-partition-id": {"type": "integer"},
+        "properties": {"type": "object"},
+        "current-snapshot-id": {"type": "number"},
+        "snapshots": {"type": "array", "items": {}},
+        "snapshot-log": {"type": "array", "items": {}},
+        "metadata-log": {"type": "array", "items": {}},
+        "sort-orders": {"type": "array", "items": {}},
+        "default-sort-order-id": {"type": "integer"},
+    },
+    "required": [
+        "format-version",
+        "table-uuid",
+        "location",
+        "last-sequence-number",
+        "last-updated-ms",
+        "last-column-id",
+        "schemas",
+        "current-schema-id",
+        "partition-spec",
+        "partition-specs",
+        "default-spec-id",
+        "last-partition-id",
+        "properties",
+        "current-snapshot-id",
+        "snapshots",
+        "snapshot-log",
+        "metadata-log",
+        "sort-orders",
+        "default-sort-order-id",
+    ],
+}
+
+
+class TableMetadata:
+    """Metadata for an Iceberg table as specified in the Apache Iceberg
+    spec (https://iceberg.apache.org/spec/#iceberg-table-spec)
+
+    Args:
+      metadata (dict): Table metadata dictionary representation
+      version (str|int): The metadata spec version (1 or 2)
+    """
+
+    def __init__(self, metadata: dict, version: Union[str, int]):
+        self._version = version
+        self._metadata = metadata
+        for name, value in metadata.items():
+            setattr(self, self._clean_attribute_name(name), self._wrap(value))
+
+    def _wrap(self, value: Any):
+        """A recursive function that drills into iterable values and returns
+        nested TableMetadata instances
+
+        Args:
+            value: A table metadata value.
+            - If it's a string, number, or boolean, the value is returned
+            - If it's an array, the values will be iterated through and processed and
+                added to an array that's returned
+            - If it's an object, another TableMetadata instance is returned
+
+        """
+
+        if isinstance(value, (tuple, list, set, frozenset)):
+            return type(value)([self._wrap(v) for v in value])
+        else:
+            return (
+                TableMetadata(value, version=self._version)
+                if (value and isinstance(value, dict))
+                else value
+            )
+
+    @staticmethod
+    def _clean_attribute_name(value):
+        """Fixes attribute names to be python friendly"""
+        return value.replace("-", "_").replace(".", "_")
+
+    def validate(self):
+        """Checks that the table metadata object is valid. The validation schema
+        used depends on the Iceberg table metadata version."""
+        casted_version = int(self._version)
+        if casted_version == 1:
+            self.validate_v1(self._metadata)
+        elif casted_version == 2:
+            self.validate_v2(self._metadata)
+        else:
+            raise ValueError(f"Unknown table metadata version {self._version}")
+
+    @staticmethod
+    def validate_v1(metadata: dict):
+        """Perform a JSONSchema validation using the v1 Iceberg table metadata schema"""
+        try:
+            validate_json(instance=metadata, schema=TABLE_METADATA_V1_SCHEMA)
+        except ValidationError as e:
+            # TODO Log something here
+            raise (e)
+
+    @staticmethod
+    def validate_v2(metadata: dict):
+        """Perform a JSONSchema validation using the v2 Iceberg table metadata schema"""
+        try:
+            validate_json(instance=metadata, schema=TABLE_METADATA_V2_SCHEMA)
+        except ValidationError as e:
+            # TODO Log something here
+            raise (e)
+
+    @classmethod
+    def from_byte_stream(cls, byte_stream, version: Union[str, int], encoding="utf-8"):
+        """Instantiate a TableMetadata object from a byte stream
+
+        Args:
+            byte_stream: A file-like byte stream object
+            version: The Iceberg table metadata version (1 or 2)
+            encoding (default "utf-8"): The byte encoder to use for the reader
+        """
+        reader = codecs.getreader(encoding)
+        metadata = json.load(reader(byte_stream))
+        return cls(metadata=metadata, version=version)
+
+    @classmethod
+    def from_s3(

Review comment:
       This has been refactored to use a generic `from_file()` method, which
depends on PR #3691 (the PR that adds the `FileIO` abstract base class).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to