This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch pyiceberg-0.2.0
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/pyiceberg-0.2.0 by this push:
     new 908761c81b Python: Fix reading UUIDs (#6486)
908761c81b is described below

commit 908761c81b6d7c12e77cabca7a2f81ee2a64384e
Author: Fokko Driesprong <[email protected]>
AuthorDate: Fri Dec 23 18:13:38 2022 +0100

    Python: Fix reading UUIDs (#6486)
---
 python/pyiceberg/avro/decoder.py            |  5 +++++
 python/pyiceberg/avro/reader.py             | 10 ++++++++--
 python/pyiceberg/io/pyarrow.py              |  6 ++++++
 python/pyiceberg/utils/schema_conversion.py |  2 +-
 python/tests/avro/test_decoder.py           |  7 +++++++
 python/tests/avro/test_reader.py            |  6 ++++++
 6 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/python/pyiceberg/avro/decoder.py b/python/pyiceberg/avro/decoder.py
index f2690c4a96..f2e67d0c74 100644
--- a/python/pyiceberg/avro/decoder.py
+++ b/python/pyiceberg/avro/decoder.py
@@ -18,6 +18,7 @@ import decimal
 import struct
 from datetime import datetime, time
 from io import SEEK_CUR
+from uuid import UUID
 
 from pyiceberg.io import InputStream
 from pyiceberg.utils.datetime import micros_to_time, micros_to_timestamp, micros_to_timestamptz
@@ -154,6 +155,10 @@ class BinaryDecoder:
         """
         return micros_to_timestamptz(self.read_int())
 
+    def read_uuid_from_fixed(self) -> UUID:
+        """Reads a UUID as a fixed[16]"""
+        return UUID(bytes=self.read(16))
+
     def skip_boolean(self) -> None:
         self.skip(1)
 
diff --git a/python/pyiceberg/avro/reader.py b/python/pyiceberg/avro/reader.py
index 8264085bd2..bbb3f82ee7 100644
--- a/python/pyiceberg/avro/reader.py
+++ b/python/pyiceberg/avro/reader.py
@@ -64,6 +64,7 @@ from pyiceberg.types import (
     TimestampType,
     TimestamptzType,
     TimeType,
+    UUIDType,
 )
 from pyiceberg.utils.singleton import Singleton
 
@@ -209,10 +210,10 @@ class StringReader(Reader):
 
 class UUIDReader(Reader):
     def read(self, decoder: BinaryDecoder) -> UUID:
-        return UUID(decoder.read_utf8())
+        return decoder.read_uuid_from_fixed()
 
     def skip(self, decoder: BinaryDecoder) -> None:
-        decoder.skip_utf8()
+        decoder.skip(16)
 
 
 @dataclass(frozen=True)
@@ -431,3 +432,8 @@ def _(_: StringType) -> Reader:
 @primitive_reader.register
 def _(_: BinaryType) -> Reader:
     return BinaryReader()
+
+
+@primitive_reader.register
+def _(_: UUIDType) -> Reader:
+    return UUIDReader()
diff --git a/python/pyiceberg/io/pyarrow.py b/python/pyiceberg/io/pyarrow.py
index ce4e9b8ae9..e19f96fb62 100644
--- a/python/pyiceberg/io/pyarrow.py
+++ b/python/pyiceberg/io/pyarrow.py
@@ -76,6 +76,7 @@ from pyiceberg.types import (
     TimestampType,
     TimestamptzType,
     TimeType,
+    UUIDType,
 )
 
 
@@ -382,6 +383,11 @@ def _(_: StringType) -> pa.DataType:
     return pa.string()
 
 
+@_iceberg_to_pyarrow_type.register
+def _(_: UUIDType) -> pa.DataType:
+    return pa.binary(16)
+
+
 @_iceberg_to_pyarrow_type.register
 def _(_: BinaryType) -> pa.DataType:
     # Variable length by default
diff --git a/python/pyiceberg/utils/schema_conversion.py b/python/pyiceberg/utils/schema_conversion.py
index c2bb5c93a5..2f9c321a13 100644
--- a/python/pyiceberg/utils/schema_conversion.py
+++ b/python/pyiceberg/utils/schema_conversion.py
@@ -68,7 +68,7 @@ LOGICAL_FIELD_TYPE_MAPPING: dict[tuple[str, str], PrimitiveType] = {
     ("timestamp-millis", "long"): TimestampType(),
     ("time-micros", "int"): TimeType(),
     ("timestamp-micros", "long"): TimestampType(),
-    ("uuid", "string"): UUIDType(),
+    ("uuid", "fixed"): UUIDType(),
 }
 
 
diff --git a/python/tests/avro/test_decoder.py b/python/tests/avro/test_decoder.py
index d48ebd0687..8a4ecf081b 100644
--- a/python/tests/avro/test_decoder.py
+++ b/python/tests/avro/test_decoder.py
@@ -17,6 +17,7 @@
 from datetime import datetime, timezone
 from decimal import Decimal
 from io import SEEK_SET
+from uuid import UUID
 
 import pytest
 
@@ -215,3 +216,9 @@ def test_read_int_as_float():
     reader = promote(FloatType(), DoubleType())
 
     assert reader.read(decoder) == 19.25
+
+
+def test_read_uuid_from_fixed() -> None:
+    mis = MemoryInputStream(b"\x12\x34\x56\x78" * 4)
+    decoder = BinaryDecoder(mis)
+    assert decoder.read_uuid_from_fixed() == UUID("{12345678-1234-5678-1234-567812345678}")
diff --git a/python/tests/avro/test_reader.py b/python/tests/avro/test_reader.py
index ac0d001abc..5b94060f40 100644
--- a/python/tests/avro/test_reader.py
+++ b/python/tests/avro/test_reader.py
@@ -34,6 +34,7 @@ from pyiceberg.avro.reader import (
     TimeReader,
     TimestampReader,
     TimestamptzReader,
+    UUIDReader,
     primitive_reader,
 )
 from pyiceberg.manifest import _convert_pos_to_dict
@@ -57,6 +58,7 @@ from pyiceberg.types import (
     TimestampType,
     TimestamptzType,
     TimeType,
+    UUIDType,
 )
 from tests.io.test_io import LocalInputFile
 
@@ -501,3 +503,7 @@ def test_unknown_type():
         primitive_reader(UnknownType())
 
     assert "Unknown type:" in str(exc_info.value)
+
+
+def test_uuid_reader() -> None:
+    assert primitive_reader(UUIDType()) == UUIDReader()

Reply via email to