This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 2535c3a8b1 Python: Change UUID representation to bytes (#8267)
2535c3a8b1 is described below

commit 2535c3a8b18910a926df45d267271aecd83317b3
Author: HonahX <[email protected]>
AuthorDate: Thu Aug 10 01:17:24 2023 -0700

    Python: Change UUID representation to bytes (#8267)
    
    * Change UUID Inner Representation to bytes, add integration tests for uuid 
and fixed
    
    * address review comments
    
    * optimize transform code
    
    * supports conversion from bytes or fixed to uuid
---
 python/dev/Dockerfile                     |  3 ++
 python/dev/provision.py                   | 33 ++++++++++++++
 python/pyiceberg/conversions.py           | 14 +++---
 python/pyiceberg/expressions/literals.py  | 34 ++++++++++++---
 python/pyiceberg/io/pyarrow.py            |  2 +-
 python/pyiceberg/transforms.py            | 11 ++---
 python/tests/expressions/test_literals.py | 42 +++++++++++++++++-
 python/tests/test_conversions.py          | 24 +++++++++--
 python/tests/test_integration.py          | 37 ++++++++++++++++
 python/tests/test_schema.py               | 72 ++++++++++++++++++++++++++++++-
 python/tests/test_transforms.py           |  6 +++
 11 files changed, 247 insertions(+), 31 deletions(-)

diff --git a/python/dev/Dockerfile b/python/dev/Dockerfile
index 21f732e774..a4099d3494 100644
--- a/python/dev/Dockerfile
+++ b/python/dev/Dockerfile
@@ -40,6 +40,7 @@ ENV SPARK_VERSION=3.4.1
 ENV ICEBERG_SPARK_RUNTIME_VERSION=3.4_2.12
 ENV ICEBERG_VERSION=1.3.1
 ENV AWS_SDK_VERSION=2.20.18
+ENV PYICEBERG_VERSION=0.4.0
 
 RUN curl --retry 3 -s -C - 
https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz
 -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
  && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark 
--strip-components 1 \
@@ -65,6 +66,8 @@ RUN chmod u+x /opt/spark/sbin/* && \
 
 RUN pip3 install -q ipython
 
+RUN pip3 install "pyiceberg[s3fs]==${PYICEBERG_VERSION}"
+
 COPY entrypoint.sh .
 COPY provision.py .
 
diff --git a/python/dev/provision.py b/python/dev/provision.py
index f62687b746..37c5fec973 100644
--- a/python/dev/provision.py
+++ b/python/dev/provision.py
@@ -18,6 +18,10 @@
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import current_date, date_add, expr
 
+from pyiceberg.catalog import load_catalog
+from pyiceberg.schema import Schema
+from pyiceberg.types import FixedType, NestedField, UUIDType
+
 spark = SparkSession.builder.getOrCreate()
 
 spark.sql(
@@ -26,6 +30,35 @@ spark.sql(
 """
 )
 
+schema = Schema(
+    NestedField(field_id=1, name="uuid_col", field_type=UUIDType(), 
required=False),
+    NestedField(field_id=2, name="fixed_col", field_type=FixedType(25), 
required=False),
+)
+
+catalog = load_catalog(
+    "local",
+    **{
+        "type": "rest",
+        "uri": "http://rest:8181";,
+        "s3.endpoint": "http://minio:9000";,
+        "s3.access-key-id": "admin",
+        "s3.secret-access-key": "password",
+    },
+)
+
+catalog.create_table(identifier="default.test_uuid_and_fixed_unpartitioned", 
schema=schema)
+
+spark.sql(
+    """
+    INSERT INTO default.test_uuid_and_fixed_unpartitioned VALUES
+    ('102cb62f-e6f8-4eb0-9973-d9b012ff0967', CAST('1234567890123456789012345' 
AS BINARY)),
+    ('ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226', CAST('1231231231231231231231231' 
AS BINARY)),
+    ('639cccce-c9d2-494a-a78c-278ab234f024', CAST('12345678901234567ass12345' 
AS BINARY)),
+    ('c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b', CAST('asdasasdads12312312312111' 
AS BINARY)),
+    ('923dae77-83d6-47cd-b4b0-d383e64ee57e', CAST('qweeqwwqq1231231231231111' 
AS BINARY));
+    """
+)
+
 spark.sql(
     """
   CREATE OR REPLACE TABLE default.test_null_nan
diff --git a/python/pyiceberg/conversions.py b/python/pyiceberg/conversions.py
index 8f155fce3d..0b3f36fba9 100644
--- a/python/pyiceberg/conversions.py
+++ b/python/pyiceberg/conversions.py
@@ -65,7 +65,6 @@ _INT_STRUCT = Struct("<i")
 _LONG_STRUCT = Struct("<q")
 _FLOAT_STRUCT = Struct("<f")
 _DOUBLE_STRUCT = Struct("<d")
-_UUID_STRUCT = Struct(">QQ")
 
 
 def handle_none(func: Callable) -> Callable:  # type: ignore
@@ -228,8 +227,10 @@ def _(_: StringType, value: str) -> bytes:
 
 
 @to_bytes.register(UUIDType)
-def _(_: UUIDType, value: uuid.UUID) -> bytes:
-    return _UUID_STRUCT.pack((value.int >> 64) & 0xFFFFFFFFFFFFFFFF, value.int 
& 0xFFFFFFFFFFFFFFFF)
+def _(_: UUIDType, value: Union[uuid.UUID, bytes]) -> bytes:
+    if isinstance(value, bytes):
+        return value
+    return value.bytes
 
 
 @to_bytes.register(BinaryType)
@@ -310,14 +311,9 @@ def _(_: StringType, b: bytes) -> str:
     return bytes(b).decode("utf-8")
 
 
-@from_bytes.register(UUIDType)
-def _(_: UUIDType, b: bytes) -> uuid.UUID:
-    unpacked_bytes = _UUID_STRUCT.unpack(b)
-    return uuid.UUID(int=unpacked_bytes[0] << 64 | unpacked_bytes[1])
-
-
 @from_bytes.register(BinaryType)
 @from_bytes.register(FixedType)
+@from_bytes.register(UUIDType)
 def _(_: PrimitiveType, b: bytes) -> bytes:
     return b
 
diff --git a/python/pyiceberg/expressions/literals.py 
b/python/pyiceberg/expressions/literals.py
index f89d0c8331..b24f3932d5 100644
--- a/python/pyiceberg/expressions/literals.py
+++ b/python/pyiceberg/expressions/literals.py
@@ -57,6 +57,8 @@ from pyiceberg.utils.datetime import (
 from pyiceberg.utils.decimal import decimal_to_unscaled, unscaled_to_decimal
 from pyiceberg.utils.singleton import Singleton
 
+UUID_BYTES_LENGTH = 16
+
 
 class Literal(Generic[L], ABC):
     """Literal which has a value and can be converted between types."""
@@ -139,7 +141,7 @@ def literal(value: L) -> Literal[L]:
     elif isinstance(value, str):
         return StringLiteral(value)
     elif isinstance(value, UUID):
-        return UUIDLiteral(value)
+        return UUIDLiteral(value.bytes)  # type: ignore
     elif isinstance(value, bytes):
         return BinaryLiteral(value)
     elif isinstance(value, Decimal):
@@ -571,8 +573,8 @@ class StringLiteral(Literal[str]):
         return TimestampLiteral(timestamptz_to_micros(self.value))
 
     @to.register(UUIDType)
-    def _(self, _: UUIDType) -> Literal[UUID]:
-        return UUIDLiteral(UUID(self.value))
+    def _(self, _: UUIDType) -> Literal[bytes]:
+        return UUIDLiteral(UUID(self.value).bytes)
 
     @to.register(DecimalType)
     def _(self, type_var: DecimalType) -> Literal[Decimal]:
@@ -596,16 +598,16 @@ class StringLiteral(Literal[str]):
         return f"literal({repr(self.value)})"
 
 
-class UUIDLiteral(Literal[UUID]):
-    def __init__(self, value: UUID) -> None:
-        super().__init__(value, UUID)
+class UUIDLiteral(Literal[bytes]):
+    def __init__(self, value: bytes) -> None:
+        super().__init__(value, bytes)
 
     @singledispatchmethod
     def to(self, type_var: IcebergType) -> Literal:  # type: ignore
         raise TypeError(f"Cannot convert UUIDLiteral into {type_var}")
 
     @to.register(UUIDType)
-    def _(self, _: UUIDType) -> Literal[UUID]:
+    def _(self, _: UUIDType) -> Literal[bytes]:
         return self
 
 
@@ -630,6 +632,15 @@ class FixedLiteral(Literal[bytes]):
     def _(self, _: BinaryType) -> Literal[bytes]:
         return BinaryLiteral(self.value)
 
+    @to.register(UUIDType)
+    def _(self, type_var: UUIDType) -> Literal[bytes]:
+        if len(self.value) == UUID_BYTES_LENGTH:
+            return UUIDLiteral(self.value)
+        else:
+            raise TypeError(
+                f"Could not convert {self.value!r} into a {type_var}, lengths 
differ {len(self.value)} <> {UUID_BYTES_LENGTH}"
+            )
+
 
 class BinaryLiteral(Literal[bytes]):
     def __init__(self, value: bytes) -> None:
@@ -651,3 +662,12 @@ class BinaryLiteral(Literal[bytes]):
             raise TypeError(
                 f"Cannot convert BinaryLiteral into {type_var}, different 
length: {len(type_var)} <> {len(self.value)}"
             )
+
+    @to.register(UUIDType)
+    def _(self, type_var: UUIDType) -> Literal[bytes]:
+        if len(self.value) == UUID_BYTES_LENGTH:
+            return UUIDLiteral(self.value)
+        else:
+            raise TypeError(
+                f"Cannot convert BinaryLiteral into {type_var}, different 
length: {UUID_BYTES_LENGTH} <> {len(self.value)}"
+            )
diff --git a/python/pyiceberg/io/pyarrow.py b/python/pyiceberg/io/pyarrow.py
index 4255072003..fba16f9992 100644
--- a/python/pyiceberg/io/pyarrow.py
+++ b/python/pyiceberg/io/pyarrow.py
@@ -451,7 +451,7 @@ class 
_ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType], Singleto
 def _convert_scalar(value: Any, iceberg_type: IcebergType) -> pa.scalar:
     if not isinstance(iceberg_type, PrimitiveType):
         raise ValueError(f"Expected primitive type, got: {iceberg_type}")
-    return pa.scalar(value).cast(schema_to_pyarrow(iceberg_type))
+    return pa.scalar(value=value, type=schema_to_pyarrow(iceberg_type))
 
 
 class _ConvertToArrowExpression(BoundBooleanExpressionVisitor[pc.Expression]):
diff --git a/python/pyiceberg/transforms.py b/python/pyiceberg/transforms.py
index 4b67f66873..3e90c911d1 100644
--- a/python/pyiceberg/transforms.py
+++ b/python/pyiceberg/transforms.py
@@ -28,6 +28,7 @@ from typing import (
 )
 from typing import Literal as LiteralType
 from typing import Optional, TypeVar
+from uuid import UUID
 
 import mmh3
 from pydantic import Field, PositiveInt, PrivateAttr
@@ -269,13 +270,9 @@ class BucketTransform(Transform[S, int]):
         elif source_type == UUIDType:
 
             def hash_func(v: Any) -> int:
-                return mmh3.hash(
-                    struct.pack(
-                        ">QQ",
-                        (v.int >> 64) & 0xFFFFFFFFFFFFFFFF,
-                        v.int & 0xFFFFFFFFFFFFFFFF,
-                    )
-                )
+                if isinstance(v, UUID):
+                    return mmh3.hash(v.bytes)
+                return mmh3.hash(v)
 
         else:
             raise ValueError(f"Unknown type {source}")
diff --git a/python/tests/expressions/test_literals.py 
b/python/tests/expressions/test_literals.py
index 16aee4dbc3..309bd28c4c 100644
--- a/python/tests/expressions/test_literals.py
+++ b/python/tests/expressions/test_literals.py
@@ -373,7 +373,7 @@ def test_string_to_uuid_literal() -> None:
     uuid_str = literal(str(expected))
     uuid_lit = uuid_str.to(UUIDType())
 
-    assert expected == uuid_lit.value
+    assert expected.bytes == uuid_lit.value
 
 
 def test_string_to_decimal_literal() -> None:
@@ -503,6 +503,22 @@ def test_binary_to_smaller_fixed_none() -> None:
     assert "Cannot convert BinaryLiteral into fixed[2], different length: 2 <> 
3" in str(e.value)
 
 
+def test_binary_to_uuid() -> None:
+    test_uuid = uuid.uuid4()
+    lit = literal(test_uuid.bytes)
+    uuid_lit = lit.to(UUIDType())
+    assert uuid_lit is not None
+    assert lit.value == uuid_lit.value
+    assert uuid_lit.value == test_uuid.bytes
+
+
+def test_incompatible_binary_to_uuid() -> None:
+    lit = literal(bytes([0x00, 0x01, 0x02]))
+    with pytest.raises(TypeError) as e:
+        _ = lit.to(UUIDType())
+        assert "Cannot convert BinaryLiteral into uuid, different length: 16 
<> 3" in str(e.value)
+
+
 def test_fixed_to_binary() -> None:
     lit = literal(bytes([0x00, 0x01, 0x02])).to(FixedType(3))
     binary_lit = lit.to(BinaryType())
@@ -517,6 +533,22 @@ def test_fixed_to_smaller_fixed_none() -> None:
     assert "Could not convert b'\\x00\\x01\\x02' into a fixed[2]" in 
str(e.value)
 
 
+def test_fixed_to_uuid() -> None:
+    test_uuid = uuid.uuid4()
+    lit = literal(test_uuid.bytes).to(FixedType(16))
+    uuid_lit = lit.to(UUIDType())
+    assert uuid_lit is not None
+    assert lit.value == uuid_lit.value
+    assert uuid_lit.value == test_uuid.bytes
+
+
+def test_incompatible_fixed_to_uuid() -> None:
+    lit = literal(bytes([0x00, 0x01, 0x02])).to(FixedType(3))
+    with pytest.raises(TypeError) as e:
+        _ = lit.to(UUIDType())
+        assert "Cannot convert BinaryLiteral into uuid, different length: 16 
<> 3" in str(e.value)
+
+
 def test_above_max_float() -> None:
     a = FloatAboveMax()
     # singleton
@@ -843,6 +875,13 @@ def test_decimal_literal_dencrement() -> None:
     assert dec.decrement().value.as_tuple() == Decimal("10.122").as_tuple()
 
 
+def test_uuid_literal_initialization() -> None:
+    test_uuid = uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")
+    uuid_literal = literal(test_uuid)
+    assert isinstance(uuid_literal, Literal)
+    assert test_uuid.bytes == uuid_literal.value
+
+
 #   __  __      ___
 #  |  \/  |_  _| _ \_  _
 #  | |\/| | || |  _/ || |
@@ -853,7 +892,6 @@ assert_type(literal("str"), Literal[str])
 assert_type(literal(True), Literal[bool])
 assert_type(literal(123), Literal[int])
 assert_type(literal(123.4), Literal[float])
-assert_type(literal(uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")), 
Literal[uuid.UUID])
 assert_type(literal(bytes([0x01, 0x02, 0x03])), Literal[bytes])
 assert_type(literal(Decimal("19.25")), Literal[Decimal])
 assert_type({literal(1), literal(2), literal(3)}, Set[Literal[int]])
diff --git a/python/tests/test_conversions.py b/python/tests/test_conversions.py
index 429de6e011..3b3e519579 100644
--- a/python/tests/test_conversions.py
+++ b/python/tests/test_conversions.py
@@ -270,9 +270,9 @@ def 
test_partition_to_py_raise_on_incorrect_precision_or_scale(
         (
             UUIDType(),
             b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7",
-            uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
+            b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7",
         ),
-        (UUIDType(), b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", 
uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")),
+        (UUIDType(), b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", 
b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7"),
         (FixedType(3), b"foo", b"foo"),
         (BinaryType(), b"foo", b"foo"),
         (DecimalType(5, 2), b"\x30\x39", Decimal("123.45")),
@@ -308,9 +308,9 @@ def test_from_bytes(primitive_type: PrimitiveType, b: 
bytes, result: Any) -> Non
         (
             UUIDType(),
             b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7",
-            uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
+            b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7",
         ),
-        (UUIDType(), b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", 
uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")),
+        (UUIDType(), b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", 
b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7"),
         (FixedType(3), b"foo", b"foo"),
         (BinaryType(), b"foo", b"foo"),
         (DecimalType(5, 2), b"\x30\x39", Decimal("123.45")),
@@ -341,6 +341,22 @@ def test_round_trip_conversion(primitive_type: 
PrimitiveType, b: bytes, result:
     assert bytes_from_value == b
 
 
[email protected](
+    "primitive_type, v, result",
+    [
+        (
+            UUIDType(),
+            uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
+            b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7",
+        ),
+        (UUIDType(), uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 
b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7"),
+    ],
+)
+def test_uuid_to_bytes(primitive_type: PrimitiveType, v: Any, result: bytes) 
-> None:
+    bytes_from_value = conversions.to_bytes(primitive_type, v)
+    assert bytes_from_value == result
+
+
 @pytest.mark.parametrize(
     "primitive_type, b, result",
     [
diff --git a/python/tests/test_integration.py b/python/tests/test_integration.py
index 37ba5b9048..9a3e044e21 100644
--- a/python/tests/test_integration.py
+++ b/python/tests/test_integration.py
@@ -17,6 +17,7 @@
 # pylint:disable=redefined-outer-name
 
 import math
+import uuid
 from urllib.parse import urlparse
 
 import pyarrow.parquet as pq
@@ -27,9 +28,11 @@ from pyiceberg.catalog import Catalog, load_catalog
 from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.expressions import (
     And,
+    EqualTo,
     GreaterThanOrEqual,
     IsNaN,
     LessThan,
+    NotEqualTo,
     NotNaN,
 )
 from pyiceberg.io.pyarrow import pyarrow_to_schema
@@ -315,3 +318,37 @@ def test_partitioned_tables(catalog: Catalog) -> None:
         table = catalog.load_table(f"default.{table_name}")
         arrow_table = table.scan(selected_fields=("number",), 
row_filter=predicate).to_arrow()
         assert set(arrow_table["number"].to_pylist()) == {5, 6, 7, 8, 9, 10, 
11, 12}, f"Table {table_name}, predicate {predicate}"
+
+
[email protected]
+def test_unpartitioned_uuid_table(catalog: Catalog) -> None:
+    unpartitioned_uuid = 
catalog.load_table("default.test_uuid_and_fixed_unpartitioned")
+    arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col == 
'102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow()
+    assert arrow_table_eq["uuid_col"].to_pylist() == 
[uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967").bytes]
+
+    arrow_table_neq = unpartitioned_uuid.scan(
+        row_filter="uuid_col != '102cb62f-e6f8-4eb0-9973-d9b012ff0967' and 
uuid_col != '639cccce-c9d2-494a-a78c-278ab234f024'"
+    ).to_arrow()
+    assert arrow_table_neq["uuid_col"].to_pylist() == [
+        uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226").bytes,
+        uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b").bytes,
+        uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e").bytes,
+    ]
+
+
[email protected]
+def test_unpartitioned_fixed_table(catalog: Catalog) -> None:
+    fixed_table = 
catalog.load_table("default.test_uuid_and_fixed_unpartitioned")
+    arrow_table_eq = fixed_table.scan(row_filter=EqualTo("fixed_col", 
b"1234567890123456789012345")).to_arrow()
+    assert arrow_table_eq["fixed_col"].to_pylist() == 
[b"1234567890123456789012345"]
+
+    arrow_table_neq = fixed_table.scan(
+        row_filter=And(
+            NotEqualTo("fixed_col", b"1234567890123456789012345"), 
NotEqualTo("uuid_col", "c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b")
+        )
+    ).to_arrow()
+    assert arrow_table_neq["fixed_col"].to_pylist() == [
+        b"1231231231231231231231231",
+        b"12345678901234567ass12345",
+        b"qweeqwwqq1231231231231111",
+    ]
diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py
index 76dddb6486..d3400b6266 100644
--- a/python/tests/test_schema.py
+++ b/python/tests/test_schema.py
@@ -21,20 +21,56 @@ from typing import Any, Dict
 import pytest
 
 from pyiceberg import schema
+from pyiceberg.exceptions import ResolveError
 from pyiceberg.expressions import Accessor
-from pyiceberg.schema import Schema, build_position_accessors, prune_columns
+from pyiceberg.schema import (
+    Schema,
+    build_position_accessors,
+    promote,
+    prune_columns,
+)
 from pyiceberg.typedef import EMPTY_DICT, StructProtocol
 from pyiceberg.types import (
+    BinaryType,
     BooleanType,
+    DateType,
+    DecimalType,
+    DoubleType,
+    FixedType,
     FloatType,
+    IcebergType,
     IntegerType,
     ListType,
+    LongType,
     MapType,
     NestedField,
     StringType,
     StructType,
+    TimestampType,
+    TimestamptzType,
+    TimeType,
+    UUIDType,
 )
 
+TEST_PRIMITIVE_TYPES = [
+    BooleanType(),
+    IntegerType(),
+    LongType(),
+    FloatType(),
+    DoubleType(),
+    DecimalType(10, 2),
+    DecimalType(100, 2),
+    StringType(),
+    DateType(),
+    TimeType(),
+    TimestamptzType(),
+    TimestampType(),
+    BinaryType(),
+    FixedType(16),
+    FixedType(20),
+    UUIDType(),
+]
+
 
 def test_schema_str(table_schema_simple: Schema) -> None:
     """Test casting a schema to a string"""
@@ -738,3 +774,37 @@ def test_schema_select_cant_be_found(table_schema_nested: 
Schema) -> None:
     with pytest.raises(ValueError) as exc_info:
         table_schema_nested.select("BAZ", case_sensitive=True)
     assert "Could not find column: 'BAZ'" in str(exc_info.value)
+
+
+def should_promote(file_type: IcebergType, read_type: IcebergType) -> bool:
+    if isinstance(file_type, IntegerType) and isinstance(read_type, LongType):
+        return True
+    if isinstance(file_type, FloatType) and isinstance(read_type, DoubleType):
+        return True
+    if isinstance(file_type, StringType) and isinstance(read_type, BinaryType):
+        return True
+    if isinstance(file_type, BinaryType) and isinstance(read_type, StringType):
+        return True
+    if isinstance(file_type, DecimalType) and isinstance(read_type, 
DecimalType):
+        return file_type.precision <= read_type.precision and file_type.scale 
== file_type.scale
+    if isinstance(file_type, FixedType) and isinstance(read_type, UUIDType) 
and len(file_type) == 16:
+        return True
+    return False
+
+
[email protected](
+    "file_type",
+    TEST_PRIMITIVE_TYPES,
+)
[email protected](
+    "read_type",
+    TEST_PRIMITIVE_TYPES,
+)
+def test_promotion(file_type: IcebergType, read_type: IcebergType) -> None:
+    if file_type == read_type:
+        return
+    if should_promote(file_type, read_type):
+        assert promote(file_type, read_type) == read_type
+    else:
+        with pytest.raises(ResolveError):
+            promote(file_type, read_type)
diff --git a/python/tests/test_transforms.py b/python/tests/test_transforms.py
index 5c4980d8ac..8d2fe19905 100644
--- a/python/tests/test_transforms.py
+++ b/python/tests/test_transforms.py
@@ -117,6 +117,7 @@ from pyiceberg.utils.datetime import (
         (b"\x00\x01\x02\x03", FixedType(4), -188683207),
         ("iceberg", StringType(), 1210000089),
         (UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), UUIDType(), 1488055340),
+        (b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", UUIDType(), 1488055340),
     ],
 )
 def test_bucket_hash_values(test_input: Any, test_type: PrimitiveType, 
expected: Any) -> None:
@@ -138,6 +139,11 @@ def test_bucket_hash_values(test_input: Any, test_type: 
PrimitiveType, expected:
             UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
             40,
         ),
+        (
+            BucketTransform(100).transform(UUIDType()),
+            b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7",
+            40,
+        ),
         (BucketTransform(128).transform(FixedType(3)), b"foo", 32),
         (BucketTransform(128).transform(BinaryType()), b"\x00\x01\x02\x03", 
57),
     ],

Reply via email to