This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch pyiceberg-0.2.0
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/pyiceberg-0.2.0 by this push:
new 908761c81b Python: Fix reading UUIDs (#6486)
908761c81b is described below
commit 908761c81b6d7c12e77cabca7a2f81ee2a64384e
Author: Fokko Driesprong <[email protected]>
AuthorDate: Fri Dec 23 18:13:38 2022 +0100
Python: Fix reading UUIDs (#6486)
---
python/pyiceberg/avro/decoder.py | 5 +++++
python/pyiceberg/avro/reader.py | 10 ++++++++--
python/pyiceberg/io/pyarrow.py | 6 ++++++
python/pyiceberg/utils/schema_conversion.py | 2 +-
python/tests/avro/test_decoder.py | 7 +++++++
python/tests/avro/test_reader.py | 6 ++++++
6 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/python/pyiceberg/avro/decoder.py b/python/pyiceberg/avro/decoder.py
index f2690c4a96..f2e67d0c74 100644
--- a/python/pyiceberg/avro/decoder.py
+++ b/python/pyiceberg/avro/decoder.py
@@ -18,6 +18,7 @@ import decimal
import struct
from datetime import datetime, time
from io import SEEK_CUR
+from uuid import UUID
from pyiceberg.io import InputStream
from pyiceberg.utils.datetime import micros_to_time, micros_to_timestamp, micros_to_timestamptz
@@ -154,6 +155,10 @@ class BinaryDecoder:
"""
return micros_to_timestamptz(self.read_int())
+ def read_uuid_from_fixed(self) -> UUID:
+ """Reads a UUID as a fixed[16]"""
+ return UUID(bytes=self.read(16))
+
def skip_boolean(self) -> None:
self.skip(1)
diff --git a/python/pyiceberg/avro/reader.py b/python/pyiceberg/avro/reader.py
index 8264085bd2..bbb3f82ee7 100644
--- a/python/pyiceberg/avro/reader.py
+++ b/python/pyiceberg/avro/reader.py
@@ -64,6 +64,7 @@ from pyiceberg.types import (
TimestampType,
TimestamptzType,
TimeType,
+ UUIDType,
)
from pyiceberg.utils.singleton import Singleton
@@ -209,10 +210,10 @@ class StringReader(Reader):
class UUIDReader(Reader):
def read(self, decoder: BinaryDecoder) -> UUID:
- return UUID(decoder.read_utf8())
+ return decoder.read_uuid_from_fixed()
def skip(self, decoder: BinaryDecoder) -> None:
- decoder.skip_utf8()
+ decoder.skip(16)
@dataclass(frozen=True)
@@ -431,3 +432,8 @@ def _(_: StringType) -> Reader:
@primitive_reader.register
def _(_: BinaryType) -> Reader:
return BinaryReader()
+
+
+@primitive_reader.register
+def _(_: UUIDType) -> Reader:
+ return UUIDReader()
diff --git a/python/pyiceberg/io/pyarrow.py b/python/pyiceberg/io/pyarrow.py
index ce4e9b8ae9..e19f96fb62 100644
--- a/python/pyiceberg/io/pyarrow.py
+++ b/python/pyiceberg/io/pyarrow.py
@@ -76,6 +76,7 @@ from pyiceberg.types import (
TimestampType,
TimestamptzType,
TimeType,
+ UUIDType,
)
@@ -382,6 +383,11 @@ def _(_: StringType) -> pa.DataType:
return pa.string()
+@_iceberg_to_pyarrow_type.register
+def _(_: UUIDType) -> pa.DataType:
+ return pa.binary(16)
+
+
@_iceberg_to_pyarrow_type.register
def _(_: BinaryType) -> pa.DataType:
# Variable length by default
diff --git a/python/pyiceberg/utils/schema_conversion.py b/python/pyiceberg/utils/schema_conversion.py
index c2bb5c93a5..2f9c321a13 100644
--- a/python/pyiceberg/utils/schema_conversion.py
+++ b/python/pyiceberg/utils/schema_conversion.py
@@ -68,7 +68,7 @@ LOGICAL_FIELD_TYPE_MAPPING: dict[tuple[str, str], PrimitiveType] = {
("timestamp-millis", "long"): TimestampType(),
("time-micros", "int"): TimeType(),
("timestamp-micros", "long"): TimestampType(),
- ("uuid", "string"): UUIDType(),
+ ("uuid", "fixed"): UUIDType(),
}
diff --git a/python/tests/avro/test_decoder.py b/python/tests/avro/test_decoder.py
index d48ebd0687..8a4ecf081b 100644
--- a/python/tests/avro/test_decoder.py
+++ b/python/tests/avro/test_decoder.py
@@ -17,6 +17,7 @@
from datetime import datetime, timezone
from decimal import Decimal
from io import SEEK_SET
+from uuid import UUID
import pytest
@@ -215,3 +216,9 @@ def test_read_int_as_float():
reader = promote(FloatType(), DoubleType())
assert reader.read(decoder) == 19.25
+
+
+def test_read_uuid_from_fixed() -> None:
+ mis = MemoryInputStream(b"\x12\x34\x56\x78" * 4)
+ decoder = BinaryDecoder(mis)
+ assert decoder.read_uuid_from_fixed() == UUID("{12345678-1234-5678-1234-567812345678}")
diff --git a/python/tests/avro/test_reader.py b/python/tests/avro/test_reader.py
index ac0d001abc..5b94060f40 100644
--- a/python/tests/avro/test_reader.py
+++ b/python/tests/avro/test_reader.py
@@ -34,6 +34,7 @@ from pyiceberg.avro.reader import (
TimeReader,
TimestampReader,
TimestamptzReader,
+ UUIDReader,
primitive_reader,
)
from pyiceberg.manifest import _convert_pos_to_dict
@@ -57,6 +58,7 @@ from pyiceberg.types import (
TimestampType,
TimestamptzType,
TimeType,
+ UUIDType,
)
from tests.io.test_io import LocalInputFile
@@ -501,3 +503,7 @@ def test_unknown_type():
primitive_reader(UnknownType())
assert "Unknown type:" in str(exc_info.value)
+
+
+def test_uuid_reader() -> None:
+ assert primitive_reader(UUIDType()) == UUIDReader()