This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 5a19b04302 Python: Support for UUID (#6446)
5a19b04302 is described below

commit 5a19b043027cea223a47bb1a19e637114827308d
Author: Fokko Driesprong <[email protected]>
AuthorDate: Sun Dec 18 21:10:47 2022 +0100

    Python: Support for UUID (#6446)
---
 python/pyiceberg/avro/decoder.py            |   5 ++
 python/pyiceberg/avro/reader.py             | 100 +++++++++-------------------
 python/pyiceberg/avro/resolver.py           |  13 ++--
 python/pyiceberg/schema.py                  |   2 +-
 python/pyiceberg/utils/schema_conversion.py |   2 +-
 python/tests/avro/test_decoder.py           |   7 ++
 python/tests/avro/test_reader.py            |  38 ++++++-----
 7 files changed, 75 insertions(+), 92 deletions(-)

diff --git a/python/pyiceberg/avro/decoder.py b/python/pyiceberg/avro/decoder.py
index 710b888d40..cef9de7c7b 100644
--- a/python/pyiceberg/avro/decoder.py
+++ b/python/pyiceberg/avro/decoder.py
@@ -18,6 +18,7 @@ import decimal
 import struct
 from datetime import date, datetime, time
 from io import SEEK_CUR
+from uuid import UUID
 
 from pyiceberg.io import InputStream
 from pyiceberg.utils.datetime import (
@@ -136,6 +137,10 @@ class BinaryDecoder:
         """
         return days_to_date(self.read_int())
 
+    def read_uuid_from_fixed(self) -> UUID:
+        """Reads a UUID as a fixed[16]"""
+        return UUID(bytes=self.read(16))
+
     def read_time_millis(self) -> time:
         """
         int is decoded as python time object which represents
diff --git a/python/pyiceberg/avro/reader.py b/python/pyiceberg/avro/reader.py
index a7631d4888..6ea2b07ced 100644
--- a/python/pyiceberg/avro/reader.py
+++ b/python/pyiceberg/avro/reader.py
@@ -30,7 +30,6 @@ from dataclasses import dataclass
 from dataclasses import field as dataclassfield
 from datetime import date, datetime, time
 from decimal import Decimal
-from functools import singledispatch
 from typing import (
     Any,
     Callable,
@@ -43,7 +42,7 @@ from typing import (
 from uuid import UUID
 
 from pyiceberg.avro.decoder import BinaryDecoder
-from pyiceberg.schema import Schema, SchemaVisitor
+from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType
 from pyiceberg.typedef import StructProtocol
 from pyiceberg.types import (
     BinaryType,
@@ -58,12 +57,12 @@ from pyiceberg.types import (
     LongType,
     MapType,
     NestedField,
-    PrimitiveType,
     StringType,
     StructType,
     TimestampType,
     TimestamptzType,
     TimeType,
+    UUIDType,
 )
 from pyiceberg.utils.singleton import Singleton
 
@@ -209,10 +208,10 @@ class StringReader(Reader):
 
 class UUIDReader(Reader):
     def read(self, decoder: BinaryDecoder) -> UUID:
-        return UUID(decoder.read_utf8())
+        return decoder.read_uuid_from_fixed()
 
     def skip(self, decoder: BinaryDecoder) -> None:
-        decoder.skip_utf8()
+        decoder.skip(16)
 
 
 @dataclass(frozen=True)
@@ -339,7 +338,7 @@ class MapReader(Reader):
         _skip_map_array(decoder, skip)
 
 
-class ConstructReader(SchemaVisitor[Reader]):
+class ConstructReader(SchemaVisitorPerPrimitiveType[Reader]):
     def schema(self, schema: Schema, struct_result: Reader) -> Reader:
         return struct_result
 
@@ -357,77 +356,44 @@ class ConstructReader(SchemaVisitor[Reader]):
         value_reader = value_result if map_type.value_required else OptionReader(value_result)
         return MapReader(key_result, value_reader)
 
-    def primitive(self, primitive: PrimitiveType) -> Reader:
-        return primitive_reader(primitive)
-
-
-@singledispatch
-def primitive_reader(primitive: PrimitiveType) -> Reader:
-    raise ValueError(f"Unknown type: {primitive}")
-
-
-@primitive_reader.register
-def _(primitive: FixedType) -> Reader:
-    return FixedReader(len(primitive))
-
-
-@primitive_reader.register
-def _(primitive: DecimalType) -> Reader:
-    return DecimalReader(primitive.precision, primitive.scale)
-
-
-@primitive_reader.register
-def _(_: BooleanType) -> Reader:
-    return BooleanReader()
-
-
-@primitive_reader.register
-def _(_: IntegerType) -> Reader:
-    return IntegerReader()
-
-
-@primitive_reader.register
-def _(_: LongType) -> Reader:
-    # Ints and longs are encoded the same way in Python and
-    # also binary compatible in Avro
-    return IntegerReader()
-
-
-@primitive_reader.register
-def _(_: FloatType) -> Reader:
-    return FloatReader()
-
+    def visit_fixed(self, fixed_type: FixedType) -> Reader:
+        return FixedReader(len(fixed_type))
 
-@primitive_reader.register
-def _(_: DoubleType) -> Reader:
-    return DoubleReader()
+    def visit_decimal(self, decimal_type: DecimalType) -> Reader:
+        return DecimalReader(decimal_type.precision, decimal_type.scale)
 
+    def visit_boolean(self, boolean_type: BooleanType) -> Reader:
+        return BooleanReader()
 
-@primitive_reader.register
-def _(_: DateType) -> Reader:
-    return DateReader()
+    def visit_integer(self, integer_type: IntegerType) -> Reader:
+        return IntegerReader()
 
+    def visit_long(self, long_type: LongType) -> Reader:
+        return IntegerReader()
 
-@primitive_reader.register
-def _(_: TimeType) -> Reader:
-    return TimeReader()
+    def visit_float(self, float_type: FloatType) -> Reader:
+        return FloatReader()
 
+    def visit_double(self, double_type: DoubleType) -> Reader:
+        return DoubleReader()
 
-@primitive_reader.register
-def _(_: TimestampType) -> Reader:
-    return TimestampReader()
+    def visit_date(self, date_type: DateType) -> Reader:
+        return DateReader()
 
+    def visit_time(self, time_type: TimeType) -> Reader:
+        return TimeReader()
 
-@primitive_reader.register
-def _(_: TimestamptzType) -> Reader:
-    return TimestamptzReader()
+    def visit_timestamp(self, timestamp_type: TimestampType) -> Reader:
+        return TimestampReader()
 
+    def visit_timestampz(self, timestamptz_type: TimestamptzType) -> Reader:
+        return TimestamptzReader()
 
-@primitive_reader.register
-def _(_: StringType) -> Reader:
-    return StringReader()
+    def visit_string(self, string_type: StringType) -> Reader:
+        return StringReader()
 
+    def visit_uuid(self, uuid_type: UUIDType) -> Reader:
+        return UUIDReader()
 
-@primitive_reader.register
-def _(_: BinaryType) -> Reader:
-    return BinaryReader()
+    def visit_binary(self, binary_ype: BinaryType) -> Reader:
+        return BinaryReader()
diff --git a/python/pyiceberg/avro/resolver.py b/python/pyiceberg/avro/resolver.py
index 1b710ea191..ca559a2998 100644
--- a/python/pyiceberg/avro/resolver.py
+++ b/python/pyiceberg/avro/resolver.py
@@ -30,7 +30,6 @@ from pyiceberg.avro.reader import (
     OptionReader,
     Reader,
     StructReader,
-    primitive_reader,
 )
 from pyiceberg.schema import Schema, visit
 from pyiceberg.types import (
@@ -133,7 +132,7 @@ def _(file_type: PrimitiveType, read_type: IcebergType) -> Reader:
     # In the case of a promotion, we want to check if it is valid
     if file_type != read_type:
         return promote(file_type, read_type)
-    return primitive_reader(read_type)
+    return visit(read_type, ConstructReader())
 
 
 @singledispatch
@@ -154,7 +153,7 @@ def promote(file_type: IcebergType, read_type: IcebergType) -> Reader:
 def _(file_type: IntegerType, read_type: IcebergType) -> Reader:
     if isinstance(read_type, LongType):
         # Ints/Longs are binary compatible in Avro, so this is okay
-        return primitive_reader(read_type)
+        return visit(read_type, ConstructReader())
     else:
         raise ResolveException(f"Cannot promote an int to {read_type}")
 
@@ -163,7 +162,7 @@ def _(file_type: IntegerType, read_type: IcebergType) -> Reader:
 def _(file_type: FloatType, read_type: IcebergType) -> Reader:
     if isinstance(read_type, DoubleType):
         # We should just read the float, and return it, since it both returns a float
-        return primitive_reader(file_type)
+        return visit(file_type, ConstructReader())
     else:
         raise ResolveException(f"Cannot promote an float to {read_type}")
 
@@ -171,7 +170,7 @@ def _(file_type: FloatType, read_type: IcebergType) -> Reader:
 @promote.register(StringType)
 def _(file_type: StringType, read_type: IcebergType) -> Reader:
     if isinstance(read_type, BinaryType):
-        return primitive_reader(read_type)
+        return visit(read_type, ConstructReader())
     else:
         raise ResolveException(f"Cannot promote an string to {read_type}")
 
@@ -179,7 +178,7 @@ def _(file_type: StringType, read_type: IcebergType) -> Reader:
 @promote.register(BinaryType)
 def _(file_type: BinaryType, read_type: IcebergType) -> Reader:
     if isinstance(read_type, StringType):
-        return primitive_reader(read_type)
+        return visit(read_type, ConstructReader())
     else:
         raise ResolveException(f"Cannot promote an binary to {read_type}")
 
@@ -188,7 +187,7 @@ def _(file_type: BinaryType, read_type: IcebergType) -> Reader:
 def _(file_type: DecimalType, read_type: IcebergType) -> Reader:
     if isinstance(read_type, DecimalType):
         if file_type.precision <= read_type.precision and file_type.scale == file_type.scale:
-            return primitive_reader(read_type)
+            return visit(read_type, ConstructReader())
         else:
             raise ResolveException(f"Cannot reduce precision from {file_type} to {read_type}")
     else:
diff --git a/python/pyiceberg/schema.py b/python/pyiceberg/schema.py
index ae11b5ba32..b265c444f1 100644
--- a/python/pyiceberg/schema.py
+++ b/python/pyiceberg/schema.py
@@ -363,7 +363,7 @@ class SchemaVisitorPerPrimitiveType(SchemaVisitor[T], ABC):
         elif isinstance(primitive, BinaryType):
             return self.visit_binary(primitive)
         else:
-            raise ValueError(f"Found unknown type: {primitive}")
+            raise ValueError(f"Unknown type: {primitive}")
 
     @abstractmethod
     def visit_fixed(self, fixed_type: FixedType) -> T:
diff --git a/python/pyiceberg/utils/schema_conversion.py b/python/pyiceberg/utils/schema_conversion.py
index c2bb5c93a5..2f9c321a13 100644
--- a/python/pyiceberg/utils/schema_conversion.py
+++ b/python/pyiceberg/utils/schema_conversion.py
@@ -68,7 +68,7 @@ LOGICAL_FIELD_TYPE_MAPPING: dict[tuple[str, str], PrimitiveType] = {
     ("timestamp-millis", "long"): TimestampType(),
     ("time-micros", "int"): TimeType(),
     ("timestamp-micros", "long"): TimestampType(),
-    ("uuid", "string"): UUIDType(),
+    ("uuid", "fixed"): UUIDType(),
 }
 
 
diff --git a/python/tests/avro/test_decoder.py b/python/tests/avro/test_decoder.py
index 9bbd67db74..ac38f38868 100644
--- a/python/tests/avro/test_decoder.py
+++ b/python/tests/avro/test_decoder.py
@@ -21,6 +21,7 @@ from decimal import Decimal
 from io import SEEK_SET
 from types import TracebackType
 from typing import Optional, Type
+from uuid import UUID
 
 import pytest
 
@@ -176,6 +177,12 @@ def test_read_date() -> None:
     assert decoder.read_date_from_int() == date(1991, 12, 27)
 
 
+def test_read_uuid_from_fixed() -> None:
+    mis = MemoryInputStream(b"\x12\x34\x56\x78" * 4)
+    decoder = BinaryDecoder(mis)
+    assert decoder.read_uuid_from_fixed() == UUID("{12345678-1234-5678-1234-567812345678}")
+
+
 def test_read_time_millis() -> None:
     mis = MemoryInputStream(b"\xBC\x7D")
     decoder = BinaryDecoder(mis)
diff --git a/python/tests/avro/test_reader.py b/python/tests/avro/test_reader.py
index da1613be95..fc37f9f9a3 100644
--- a/python/tests/avro/test_reader.py
+++ b/python/tests/avro/test_reader.py
@@ -24,6 +24,7 @@ from pyiceberg.avro.reader import (
     AvroStruct,
     BinaryReader,
     BooleanReader,
+    ConstructReader,
     DateReader,
     DecimalReader,
     DoubleReader,
@@ -34,10 +35,10 @@ from pyiceberg.avro.reader import (
     TimeReader,
     TimestampReader,
     TimestamptzReader,
-    primitive_reader,
+    UUIDReader,
 )
 from pyiceberg.manifest import _convert_pos_to_dict
-from pyiceberg.schema import Schema
+from pyiceberg.schema import Schema, visit
 from pyiceberg.types import (
     BinaryType,
     BooleanType,
@@ -57,6 +58,7 @@ from pyiceberg.types import (
     TimestampType,
     TimestamptzType,
     TimeType,
+    UUIDType,
 )
 from tests.io.test_io import LocalInputFile
 
@@ -442,55 +444,55 @@ def test_null_struct_convert_pos_to_dict() -> None:
 
 
 def test_fixed_reader() -> None:
-    assert primitive_reader(FixedType(22)) == FixedReader(22)
+    assert visit(FixedType(22), ConstructReader()) == FixedReader(22)
 
 
 def test_decimal_reader() -> None:
-    assert primitive_reader(DecimalType(19, 25)) == DecimalReader(19, 25)
+    assert visit(DecimalType(19, 25), ConstructReader()) == DecimalReader(19, 25)
 
 
 def test_boolean_reader() -> None:
-    assert primitive_reader(BooleanType()) == BooleanReader()
+    assert visit(BooleanType(), ConstructReader()) == BooleanReader()
 
 
 def test_integer_reader() -> None:
-    assert primitive_reader(IntegerType()) == IntegerReader()
+    assert visit(IntegerType(), ConstructReader()) == IntegerReader()
 
 
 def test_long_reader() -> None:
-    assert primitive_reader(LongType()) == IntegerReader()
+    assert visit(LongType(), ConstructReader()) == IntegerReader()
 
 
 def test_float_reader() -> None:
-    assert primitive_reader(FloatType()) == FloatReader()
+    assert visit(FloatType(), ConstructReader()) == FloatReader()
 
 
 def test_double_reader() -> None:
-    assert primitive_reader(DoubleType()) == DoubleReader()
+    assert visit(DoubleType(), ConstructReader()) == DoubleReader()
 
 
 def test_date_reader() -> None:
-    assert primitive_reader(DateType()) == DateReader()
+    assert visit(DateType(), ConstructReader()) == DateReader()
 
 
 def test_time_reader() -> None:
-    assert primitive_reader(TimeType()) == TimeReader()
+    assert visit(TimeType(), ConstructReader()) == TimeReader()
 
 
 def test_timestamp_reader() -> None:
-    assert primitive_reader(TimestampType()) == TimestampReader()
+    assert visit(TimestampType(), ConstructReader()) == TimestampReader()
 
 
 def test_timestamptz_reader() -> None:
-    assert primitive_reader(TimestamptzType()) == TimestamptzReader()
+    assert visit(TimestamptzType(), ConstructReader()) == TimestamptzReader()
 
 
 def test_string_reader() -> None:
-    assert primitive_reader(StringType()) == StringReader()
+    assert visit(StringType(), ConstructReader()) == StringReader()
 
 
 def test_binary_reader() -> None:
-    assert primitive_reader(BinaryType()) == BinaryReader()
+    assert visit(BinaryType(), ConstructReader()) == BinaryReader()
 
 
 def test_unknown_type() -> None:
@@ -498,6 +500,10 @@ def test_unknown_type() -> None:
         __root__ = "UnknownType"
 
     with pytest.raises(ValueError) as exc_info:
-        primitive_reader(UnknownType())
+        visit(UnknownType(), ConstructReader())
 
     assert "Unknown type:" in str(exc_info.value)
+
+
+def test_uuid_reader() -> None:
+    assert visit(UUIDType(), ConstructReader()) == UUIDReader()

Reply via email to