This is an automated email from the ASF dual-hosted git repository.

gooch pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new a54ba55  [Python] support BucketByteBuffer and BucketUUID (#2836)
a54ba55 is described below

commit a54ba55095c52c979ee245c45c860e2855b44b28
Author: jun-he <[email protected]>
AuthorDate: Wed Jul 28 12:34:22 2021 -0700

    [Python] support BucketByteBuffer and BucketUUID (#2836)
    
    * [Python] support BucketByteBuffer and BucketUUID
    
    * Add additional unit tests for bucket hash methods.
---
 python/iceberg/api/transforms/bucket.py    | 21 ++++++++++---
 python/tests/api/test_partition_spec.py    | 16 +++++-----
 python/tests/api/transforms/test_bucket.py | 49 ++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/python/iceberg/api/transforms/bucket.py b/python/iceberg/api/transforms/bucket.py
index 64db712..2884b81 100644
--- a/python/iceberg/api/transforms/bucket.py
+++ b/python/iceberg/api/transforms/bucket.py
@@ -28,6 +28,7 @@ from ..expressions import (Expressions,
                            Operation)
 from ..types.types import (IntegerType,
                            TypeID)
+from ...api.types.conversions import Conversions
 
 
 class Bucket(Transform):
@@ -69,7 +70,7 @@ class Bucket(Transform):
         return Bucket.__class__, self.n
 
     def __repr__(self):
-        return "Bucket(n)" % self.n
+        return "Bucket[%s]" % self.n
 
     def __str__(self):
         return "bucket[%s]" % self.n
@@ -167,14 +168,24 @@ class BucketString(Bucket):
 
 class BucketByteBuffer(Bucket):
     def __init__(self, n):
-        # super(BucketByteBuffer, self).__init__(n)
-        raise NotImplementedError()
+        super(BucketByteBuffer, self).__init__(n)
+
+    def hash(self, value):
+        return Bucket.MURMUR3.hash(value)
+
+    def can_transform(self, type_var):
+        return type_var.type_id in [TypeID.BINARY, TypeID.FIXED]
 
 
 class BucketUUID(Bucket):
     def __init__(self, n):
-        # super(BucketUUID, self).__init__(n)
-        raise NotImplementedError()
+        super(BucketUUID, self).__init__(n)
+
+    def hash(self, value):
+        return Bucket.MURMUR3.hash(Conversions.to_byte_buffer(TypeID.UUID, value))
+
+    def can_transform(self, type_var):
+        return type_var.type_id == TypeID.UUID
 
 
 def to_bytes(n, length, byteorder='big'):
diff --git a/python/tests/api/test_partition_spec.py b/python/tests/api/test_partition_spec.py
index 93cee14..70ab574 100644
--- a/python/tests/api/test_partition_spec.py
+++ b/python/tests/api/test_partition_spec.py
@@ -30,9 +30,9 @@ from iceberg.api.types import (BinaryType,
 from tests.api.test_helpers import TestHelpers
 
 
-class TestConversions(unittest.TestCase):
+class TestPartitionSpec(unittest.TestCase):
 
-    def test_transforms(self):
+    def test_partition_spec(self):
         schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                         NestedField.required(2, "l", LongType.get()),
                         NestedField.required(3, "d", DateType.get()),
@@ -60,10 +60,9 @@ class TestConversions(unittest.TestCase):
                  PartitionSpec.builder_for(schema).bucket("ts", 128).build(),
                  PartitionSpec.builder_for(schema).bucket("dec", 128).build(),
                  PartitionSpec.builder_for(schema).bucket("s", 128).build(),
-                 # todo support them
-                 # PartitionSpec.builder_for(schema).bucket("u", 128).build(),
-                 # PartitionSpec.builder_for(schema).bucket("f", 128).build(),
-                 # PartitionSpec.builder_for(schema).bucket("b", 128).build(),
+                 PartitionSpec.builder_for(schema).bucket("u", 128).build(),
+                 PartitionSpec.builder_for(schema).bucket("f", 128).build(),
+                 PartitionSpec.builder_for(schema).bucket("b", 128).build(),
                  PartitionSpec.builder_for(schema).year("d").build(),
                  PartitionSpec.builder_for(schema).month("d").build(),
                  PartitionSpec.builder_for(schema).day("d").build(),
@@ -75,9 +74,8 @@ class TestConversions(unittest.TestCase):
                  PartitionSpec.builder_for(schema).truncate("l", 10).build(),
                  PartitionSpec.builder_for(schema).truncate("dec", 10).build(),
                  PartitionSpec.builder_for(schema).truncate("s", 10).build(),
-                 # todo support them
-                 # PartitionSpec.builder_for(schema).add_without_field_id(6, "dec_unsupported", "unsupported").build(),
-                 # PartitionSpec.builder_for(schema).add(6, 1111, "dec_unsupported", "unsupported").build(),
+                 PartitionSpec.builder_for(schema).add_without_field_id(6, "dec_unsupported", "unsupported").build(),
+                 PartitionSpec.builder_for(schema).add(6, 1111, "dec_unsupported", "unsupported").build(),
                  ]
 
         for spec in specs:
diff --git a/python/tests/api/transforms/test_bucket.py b/python/tests/api/transforms/test_bucket.py
new file mode 100644
index 0000000..afda054
--- /dev/null
+++ b/python/tests/api/transforms/test_bucket.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import decimal
+import unittest
+import uuid
+
+from iceberg.api.transforms import Transforms
+from iceberg.api.types import (BinaryType,
+                               DateType,
+                               DecimalType,
+                               FixedType,
+                               IntegerType,
+                               LongType,
+                               StringType,
+                               TimestampType,
+                               TimeType,
+                               UUIDType)
+
+
+class TestBucket(unittest.TestCase):
+
+    def test_bucket_hash(self):
+        buckets = [
+            [Transforms.bucket(IntegerType.get(), 100), 34, 2017239379],
+            [Transforms.bucket(LongType.get(), 100), 34, 2017239379],
+            [Transforms.bucket(DateType.get(), 100), 17486, -653330422],
+            [Transforms.bucket(TimeType.get(), 100), 81068000000, -662762989],
+            [Transforms.bucket(TimestampType.without_timezone(), 100), 1510871468000000, -2047944441],
+            [Transforms.bucket(DecimalType.of(9, 2), 100), decimal.Decimal("14.20"), -500754589],
+            [Transforms.bucket(StringType.get(), 100), "iceberg", 1210000089],
+            [Transforms.bucket(UUIDType.get(), 100), uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 1488055340],
+            [Transforms.bucket(FixedType.of_length(3), 128), b'foo', -156908512],
+            [Transforms.bucket(BinaryType.get(), 128), b'\x00\x01\x02\x03', -188683207]
+        ]
+
+        for bucket in buckets:
+            self.assertEqual(bucket[2], bucket[0].hash(bucket[1]))

Reply via email to