This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 5a19b04302 Python: Support for UUID (#6446)
5a19b04302 is described below
commit 5a19b043027cea223a47bb1a19e637114827308d
Author: Fokko Driesprong <[email protected]>
AuthorDate: Sun Dec 18 21:10:47 2022 +0100
Python: Support for UUID (#6446)
---
python/pyiceberg/avro/decoder.py | 5 ++
python/pyiceberg/avro/reader.py | 100 +++++++++-------------------
python/pyiceberg/avro/resolver.py | 13 ++--
python/pyiceberg/schema.py | 2 +-
python/pyiceberg/utils/schema_conversion.py | 2 +-
python/tests/avro/test_decoder.py | 7 ++
python/tests/avro/test_reader.py | 38 ++++++-----
7 files changed, 75 insertions(+), 92 deletions(-)
diff --git a/python/pyiceberg/avro/decoder.py b/python/pyiceberg/avro/decoder.py
index 710b888d40..cef9de7c7b 100644
--- a/python/pyiceberg/avro/decoder.py
+++ b/python/pyiceberg/avro/decoder.py
@@ -18,6 +18,7 @@ import decimal
import struct
from datetime import date, datetime, time
from io import SEEK_CUR
+from uuid import UUID
from pyiceberg.io import InputStream
from pyiceberg.utils.datetime import (
@@ -136,6 +137,10 @@ class BinaryDecoder:
"""
return days_to_date(self.read_int())
+ def read_uuid_from_fixed(self) -> UUID:
+ """Reads a UUID as a fixed[16]"""
+ return UUID(bytes=self.read(16))
+
def read_time_millis(self) -> time:
"""
int is decoded as python time object which represents
diff --git a/python/pyiceberg/avro/reader.py b/python/pyiceberg/avro/reader.py
index a7631d4888..6ea2b07ced 100644
--- a/python/pyiceberg/avro/reader.py
+++ b/python/pyiceberg/avro/reader.py
@@ -30,7 +30,6 @@ from dataclasses import dataclass
from dataclasses import field as dataclassfield
from datetime import date, datetime, time
from decimal import Decimal
-from functools import singledispatch
from typing import (
Any,
Callable,
@@ -43,7 +42,7 @@ from typing import (
from uuid import UUID
from pyiceberg.avro.decoder import BinaryDecoder
-from pyiceberg.schema import Schema, SchemaVisitor
+from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType
from pyiceberg.typedef import StructProtocol
from pyiceberg.types import (
BinaryType,
@@ -58,12 +57,12 @@ from pyiceberg.types import (
LongType,
MapType,
NestedField,
- PrimitiveType,
StringType,
StructType,
TimestampType,
TimestamptzType,
TimeType,
+ UUIDType,
)
from pyiceberg.utils.singleton import Singleton
@@ -209,10 +208,10 @@ class StringReader(Reader):
class UUIDReader(Reader):
def read(self, decoder: BinaryDecoder) -> UUID:
- return UUID(decoder.read_utf8())
+ return decoder.read_uuid_from_fixed()
def skip(self, decoder: BinaryDecoder) -> None:
- decoder.skip_utf8()
+ decoder.skip(16)
@dataclass(frozen=True)
@@ -339,7 +338,7 @@ class MapReader(Reader):
_skip_map_array(decoder, skip)
-class ConstructReader(SchemaVisitor[Reader]):
+class ConstructReader(SchemaVisitorPerPrimitiveType[Reader]):
def schema(self, schema: Schema, struct_result: Reader) -> Reader:
return struct_result
@@ -357,77 +356,44 @@ class ConstructReader(SchemaVisitor[Reader]):
value_reader = value_result if map_type.value_required else OptionReader(value_result)
return MapReader(key_result, value_reader)
- def primitive(self, primitive: PrimitiveType) -> Reader:
- return primitive_reader(primitive)
-
-
-@singledispatch
-def primitive_reader(primitive: PrimitiveType) -> Reader:
- raise ValueError(f"Unknown type: {primitive}")
-
-
-@primitive_reader.register
-def _(primitive: FixedType) -> Reader:
- return FixedReader(len(primitive))
-
-
-@primitive_reader.register
-def _(primitive: DecimalType) -> Reader:
- return DecimalReader(primitive.precision, primitive.scale)
-
-
-@primitive_reader.register
-def _(_: BooleanType) -> Reader:
- return BooleanReader()
-
-
-@primitive_reader.register
-def _(_: IntegerType) -> Reader:
- return IntegerReader()
-
-
-@primitive_reader.register
-def _(_: LongType) -> Reader:
- # Ints and longs are encoded the same way in Python and
- # also binary compatible in Avro
- return IntegerReader()
-
-
-@primitive_reader.register
-def _(_: FloatType) -> Reader:
- return FloatReader()
-
+ def visit_fixed(self, fixed_type: FixedType) -> Reader:
+ return FixedReader(len(fixed_type))
-@primitive_reader.register
-def _(_: DoubleType) -> Reader:
- return DoubleReader()
+ def visit_decimal(self, decimal_type: DecimalType) -> Reader:
+ return DecimalReader(decimal_type.precision, decimal_type.scale)
+ def visit_boolean(self, boolean_type: BooleanType) -> Reader:
+ return BooleanReader()
-@primitive_reader.register
-def _(_: DateType) -> Reader:
- return DateReader()
+ def visit_integer(self, integer_type: IntegerType) -> Reader:
+ return IntegerReader()
+ def visit_long(self, long_type: LongType) -> Reader:
+ return IntegerReader()
-@primitive_reader.register
-def _(_: TimeType) -> Reader:
- return TimeReader()
+ def visit_float(self, float_type: FloatType) -> Reader:
+ return FloatReader()
+ def visit_double(self, double_type: DoubleType) -> Reader:
+ return DoubleReader()
-@primitive_reader.register
-def _(_: TimestampType) -> Reader:
- return TimestampReader()
+ def visit_date(self, date_type: DateType) -> Reader:
+ return DateReader()
+ def visit_time(self, time_type: TimeType) -> Reader:
+ return TimeReader()
-@primitive_reader.register
-def _(_: TimestamptzType) -> Reader:
- return TimestamptzReader()
+ def visit_timestamp(self, timestamp_type: TimestampType) -> Reader:
+ return TimestampReader()
+ def visit_timestampz(self, timestamptz_type: TimestamptzType) -> Reader:
+ return TimestamptzReader()
-@primitive_reader.register
-def _(_: StringType) -> Reader:
- return StringReader()
+ def visit_string(self, string_type: StringType) -> Reader:
+ return StringReader()
+ def visit_uuid(self, uuid_type: UUIDType) -> Reader:
+ return UUIDReader()
-@primitive_reader.register
-def _(_: BinaryType) -> Reader:
- return BinaryReader()
+ def visit_binary(self, binary_ype: BinaryType) -> Reader:
+ return BinaryReader()
diff --git a/python/pyiceberg/avro/resolver.py b/python/pyiceberg/avro/resolver.py
index 1b710ea191..ca559a2998 100644
--- a/python/pyiceberg/avro/resolver.py
+++ b/python/pyiceberg/avro/resolver.py
@@ -30,7 +30,6 @@ from pyiceberg.avro.reader import (
OptionReader,
Reader,
StructReader,
- primitive_reader,
)
from pyiceberg.schema import Schema, visit
from pyiceberg.types import (
@@ -133,7 +132,7 @@ def _(file_type: PrimitiveType, read_type: IcebergType) -> Reader:
# In the case of a promotion, we want to check if it is valid
if file_type != read_type:
return promote(file_type, read_type)
- return primitive_reader(read_type)
+ return visit(read_type, ConstructReader())
@singledispatch
@@ -154,7 +153,7 @@ def promote(file_type: IcebergType, read_type: IcebergType) -> Reader:
def _(file_type: IntegerType, read_type: IcebergType) -> Reader:
if isinstance(read_type, LongType):
# Ints/Longs are binary compatible in Avro, so this is okay
- return primitive_reader(read_type)
+ return visit(read_type, ConstructReader())
else:
raise ResolveException(f"Cannot promote an int to {read_type}")
@@ -163,7 +162,7 @@ def _(file_type: IntegerType, read_type: IcebergType) -> Reader:
def _(file_type: FloatType, read_type: IcebergType) -> Reader:
if isinstance(read_type, DoubleType):
# We should just read the float, and return it, since it both returns a float
- return primitive_reader(file_type)
+ return visit(file_type, ConstructReader())
else:
raise ResolveException(f"Cannot promote an float to {read_type}")
@@ -171,7 +170,7 @@ def _(file_type: FloatType, read_type: IcebergType) -> Reader:
@promote.register(StringType)
def _(file_type: StringType, read_type: IcebergType) -> Reader:
if isinstance(read_type, BinaryType):
- return primitive_reader(read_type)
+ return visit(read_type, ConstructReader())
else:
raise ResolveException(f"Cannot promote an string to {read_type}")
@@ -179,7 +178,7 @@ def _(file_type: StringType, read_type: IcebergType) -> Reader:
@promote.register(BinaryType)
def _(file_type: BinaryType, read_type: IcebergType) -> Reader:
if isinstance(read_type, StringType):
- return primitive_reader(read_type)
+ return visit(read_type, ConstructReader())
else:
raise ResolveException(f"Cannot promote an binary to {read_type}")
@@ -188,7 +187,7 @@ def _(file_type: BinaryType, read_type: IcebergType) -> Reader:
def _(file_type: DecimalType, read_type: IcebergType) -> Reader:
if isinstance(read_type, DecimalType):
if file_type.precision <= read_type.precision and file_type.scale == file_type.scale:
- return primitive_reader(read_type)
+ return visit(read_type, ConstructReader())
else:
raise ResolveException(f"Cannot reduce precision from {file_type} to {read_type}")
else:
diff --git a/python/pyiceberg/schema.py b/python/pyiceberg/schema.py
index ae11b5ba32..b265c444f1 100644
--- a/python/pyiceberg/schema.py
+++ b/python/pyiceberg/schema.py
@@ -363,7 +363,7 @@ class SchemaVisitorPerPrimitiveType(SchemaVisitor[T], ABC):
elif isinstance(primitive, BinaryType):
return self.visit_binary(primitive)
else:
- raise ValueError(f"Found unknown type: {primitive}")
+ raise ValueError(f"Unknown type: {primitive}")
@abstractmethod
def visit_fixed(self, fixed_type: FixedType) -> T:
diff --git a/python/pyiceberg/utils/schema_conversion.py b/python/pyiceberg/utils/schema_conversion.py
index c2bb5c93a5..2f9c321a13 100644
--- a/python/pyiceberg/utils/schema_conversion.py
+++ b/python/pyiceberg/utils/schema_conversion.py
@@ -68,7 +68,7 @@ LOGICAL_FIELD_TYPE_MAPPING: dict[tuple[str, str], PrimitiveType] = {
("timestamp-millis", "long"): TimestampType(),
("time-micros", "int"): TimeType(),
("timestamp-micros", "long"): TimestampType(),
- ("uuid", "string"): UUIDType(),
+ ("uuid", "fixed"): UUIDType(),
}
diff --git a/python/tests/avro/test_decoder.py b/python/tests/avro/test_decoder.py
index 9bbd67db74..ac38f38868 100644
--- a/python/tests/avro/test_decoder.py
+++ b/python/tests/avro/test_decoder.py
@@ -21,6 +21,7 @@ from decimal import Decimal
from io import SEEK_SET
from types import TracebackType
from typing import Optional, Type
+from uuid import UUID
import pytest
@@ -176,6 +177,12 @@ def test_read_date() -> None:
assert decoder.read_date_from_int() == date(1991, 12, 27)
+def test_read_uuid_from_fixed() -> None:
+ mis = MemoryInputStream(b"\x12\x34\x56\x78" * 4)
+ decoder = BinaryDecoder(mis)
+ assert decoder.read_uuid_from_fixed() == UUID("{12345678-1234-5678-1234-567812345678}")
+
+
def test_read_time_millis() -> None:
mis = MemoryInputStream(b"\xBC\x7D")
decoder = BinaryDecoder(mis)
diff --git a/python/tests/avro/test_reader.py b/python/tests/avro/test_reader.py
index da1613be95..fc37f9f9a3 100644
--- a/python/tests/avro/test_reader.py
+++ b/python/tests/avro/test_reader.py
@@ -24,6 +24,7 @@ from pyiceberg.avro.reader import (
AvroStruct,
BinaryReader,
BooleanReader,
+ ConstructReader,
DateReader,
DecimalReader,
DoubleReader,
@@ -34,10 +35,10 @@ from pyiceberg.avro.reader import (
TimeReader,
TimestampReader,
TimestamptzReader,
- primitive_reader,
+ UUIDReader,
)
from pyiceberg.manifest import _convert_pos_to_dict
-from pyiceberg.schema import Schema
+from pyiceberg.schema import Schema, visit
from pyiceberg.types import (
BinaryType,
BooleanType,
@@ -57,6 +58,7 @@ from pyiceberg.types import (
TimestampType,
TimestamptzType,
TimeType,
+ UUIDType,
)
from tests.io.test_io import LocalInputFile
@@ -442,55 +444,55 @@ def test_null_struct_convert_pos_to_dict() -> None:
def test_fixed_reader() -> None:
- assert primitive_reader(FixedType(22)) == FixedReader(22)
+ assert visit(FixedType(22), ConstructReader()) == FixedReader(22)
def test_decimal_reader() -> None:
- assert primitive_reader(DecimalType(19, 25)) == DecimalReader(19, 25)
+ assert visit(DecimalType(19, 25), ConstructReader()) == DecimalReader(19, 25)
def test_boolean_reader() -> None:
- assert primitive_reader(BooleanType()) == BooleanReader()
+ assert visit(BooleanType(), ConstructReader()) == BooleanReader()
def test_integer_reader() -> None:
- assert primitive_reader(IntegerType()) == IntegerReader()
+ assert visit(IntegerType(), ConstructReader()) == IntegerReader()
def test_long_reader() -> None:
- assert primitive_reader(LongType()) == IntegerReader()
+ assert visit(LongType(), ConstructReader()) == IntegerReader()
def test_float_reader() -> None:
- assert primitive_reader(FloatType()) == FloatReader()
+ assert visit(FloatType(), ConstructReader()) == FloatReader()
def test_double_reader() -> None:
- assert primitive_reader(DoubleType()) == DoubleReader()
+ assert visit(DoubleType(), ConstructReader()) == DoubleReader()
def test_date_reader() -> None:
- assert primitive_reader(DateType()) == DateReader()
+ assert visit(DateType(), ConstructReader()) == DateReader()
def test_time_reader() -> None:
- assert primitive_reader(TimeType()) == TimeReader()
+ assert visit(TimeType(), ConstructReader()) == TimeReader()
def test_timestamp_reader() -> None:
- assert primitive_reader(TimestampType()) == TimestampReader()
+ assert visit(TimestampType(), ConstructReader()) == TimestampReader()
def test_timestamptz_reader() -> None:
- assert primitive_reader(TimestamptzType()) == TimestamptzReader()
+ assert visit(TimestamptzType(), ConstructReader()) == TimestamptzReader()
def test_string_reader() -> None:
- assert primitive_reader(StringType()) == StringReader()
+ assert visit(StringType(), ConstructReader()) == StringReader()
def test_binary_reader() -> None:
- assert primitive_reader(BinaryType()) == BinaryReader()
+ assert visit(BinaryType(), ConstructReader()) == BinaryReader()
def test_unknown_type() -> None:
@@ -498,6 +500,10 @@ def test_unknown_type() -> None:
__root__ = "UnknownType"
with pytest.raises(ValueError) as exc_info:
- primitive_reader(UnknownType())
+ visit(UnknownType(), ConstructReader())
assert "Unknown type:" in str(exc_info.value)
+
+
+def test_uuid_reader() -> None:
+ assert visit(UUIDType(), ConstructReader()) == UUIDReader()