samredai commented on a change in pull request #3677:
URL: https://github.com/apache/iceberg/pull/3677#discussion_r763304666
##########
File path: python/src/iceberg/table/metadata.py
##########
@@ -0,0 +1,215 @@
+import codecs
+import json
+from typing import Any, Optional, Union
+
+import boto3
+from jsonschema import validate as validate_json
+from jsonschema.exceptions import ValidationError
+
+from iceberg.io.s3 import S3Url
+
+TABLE_METADATA_V1_SCHEMA = {
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "type": "object",
+ "properties": {
+ "format-version": {"type": "string"},
+ "table-uuid": {"type": "string"},
+ "location": {"type": "string"},
+ "last-sequence-number": {"type": "integer"},
+ "last-updated-ms": {"type": "integer"},
+ "last-column-id": {"type": "integer"},
+ "schemas": {"type": "array", "items": {}},
+ "current-schema-id": {"type": "integer"},
+ "partition-spec": {"type": "array", "items": {}},
+ "partition-specs": {"type": "array", "items": {}},
+ "default-spec-id": {"type": "integer"},
+ "last-partition-id": {"type": "integer"},
+ "properties": {"type": "object"},
+ "current-snapshot-id": {"type": "number"},
+ "snapshots": {"type": "array", "items": {}},
+ "snapshot-log": {"type": "array", "items": {}},
+ "metadata-log": {"type": "array", "items": {}},
+ "sort-orders": {"type": "array", "items": {}},
+ "default-sort-order-id": {"type": "integer"},
+ },
+ "required": [
+ "format-version",
+ "table-uuid",
+ "location",
+ "last-sequence-number",
+ "last-updated-ms",
+ "last-column-id",
+ "schemas",
+ "current-schema-id",
+ "partition-spec",
+ "partition-specs",
+ "default-spec-id",
+ "last-partition-id",
+ "properties",
+ "current-snapshot-id",
+ "snapshots",
+ "snapshot-log",
+ "metadata-log",
+ "sort-orders",
+ "default-sort-order-id",
+ ],
+}
+TABLE_METADATA_V2_SCHEMA = {
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "type": "object",
+ "properties": {
+ "format-version": {"type": "string"},
+ "table-uuid": {"type": "string"},
+ "location": {"type": "string"},
+ "last-sequence-number": {"type": "integer"},
+ "last-updated-ms": {"type": "integer"},
+ "last-column-id": {"type": "integer"},
+ "schemas": {"type": "array", "items": {}},
+ "current-schema-id": {"type": "integer"},
+ "partition-spec": {"type": "array", "items": {}},
+ "partition-specs": {"type": "array", "items": {}},
+ "default-spec-id": {"type": "integer"},
+ "last-partition-id": {"type": "integer"},
+ "properties": {"type": "object"},
+ "current-snapshot-id": {"type": "number"},
+ "snapshots": {"type": "array", "items": {}},
+ "snapshot-log": {"type": "array", "items": {}},
+ "metadata-log": {"type": "array", "items": {}},
+ "sort-orders": {"type": "array", "items": {}},
+ "default-sort-order-id": {"type": "integer"},
+ },
+ "required": [
+ "format-version",
+ "table-uuid",
+ "location",
+ "last-sequence-number",
+ "last-updated-ms",
+ "last-column-id",
+ "schemas",
+ "current-schema-id",
+ "partition-spec",
+ "partition-specs",
+ "default-spec-id",
+ "last-partition-id",
+ "properties",
+ "current-snapshot-id",
+ "snapshots",
+ "snapshot-log",
+ "metadata-log",
+ "sort-orders",
+ "default-sort-order-id",
+ ],
+}
+
+
+class TableMetadata:
+ """Metadata for an Iceberg table as specified in the Apache Iceberg
+ spec (https://iceberg.apache.org/spec/#iceberg-table-spec)
+
+ Args:
+ metadata (dict): Table metadata dictionary representation
+ version (str|int): The metadata spec version (1 or 2)
+ """
+
+ def __init__(self, metadata: dict, version: Union[str, int]):
+ self._version = version
+ self._metadata = metadata
+ for name, value in metadata.items():
+ setattr(self, self._clean_attribute_name(name), self._wrap(value))
+
+ def _wrap(self, value: Any):
+ """A recursive function that drills into iterable values and returns
+ nested TableMetadata instances
+
+ Args:
+ value: A table metadata value.
+ - If it's a string, number, or boolean, the value is returned
+ - If it's an array, the values will be iterated through and processed and
+ added to an array that's returned
+ - If it's an object, another TableMetadata instance is returned
+
+ """
+
+ if isinstance(value, (tuple, list, set, frozenset)):
+ return type(value)([self._wrap(v) for v in value])
+ else:
+ return (
+ TableMetadata(value, version=self._version)
+ if (value and isinstance(value, dict))
+ else value
+ )
+
+ @staticmethod
+ def _clean_attribute_name(value):
+ """Fixes attribute names to be python friendly"""
+ return value.replace("-", "_").replace(".", "_")
+
+ def validate(self):
+ """Checks that the table metadata object is valid. The validation schema
+ used depends on the Iceberg table metadata version."""
+ casted_version = int(self._version)
+ if casted_version == 1:
+ self.validate_v1(self._metadata)
+ elif casted_version == 2:
+ self.validate_v2(self._metadata)
+ else:
+ raise ValueError(f"Unknown table metadata version {self._version}")
+
+ @staticmethod
+ def validate_v1(metadata: dict):
+ """Perform a JSONSchema validation using the v1 Iceberg table metadata schema"""
+ try:
+ validate_json(instance=metadata, schema=TABLE_METADATA_V1_SCHEMA)
+ except ValidationError as e:
+ # TODO Log something here
+ raise (e)
+
+ @staticmethod
+ def validate_v2(metadata: dict):
+ """Perform a JSONSchema validation using the v2 Iceberg table metadata schema"""
+ try:
+ validate_json(instance=metadata, schema=TABLE_METADATA_V2_SCHEMA)
+ except ValidationError as e:
+ # TODO Log something here
+ raise (e)
+
+ @classmethod
+ def from_byte_stream(cls, byte_stream, version: Union[str, int], encoding="utf-8"):
+ """Instantiate a TableMetadata object from a byte stream
+
+ Args:
+ byte_stream: A file-like byte stream object
+ version: The Iceberg table metadata version (1 or 2)
+ encoding (default "utf-8"): The byte encoder to use for the reader
+ """
+ reader = codecs.getreader(encoding)
+ metadata = json.load(reader(byte_stream))
+ return cls(metadata=metadata, version=version)
+
+ @classmethod
+ def from_s3(
Review comment:
Currently, this `from_s3` class method lives here, but it should be
abstracted out into a generic file-IO method built on something like a `FileIO`
abstract base class.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]