This is an automated email from the ASF dual-hosted git repository.
gooch pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new a54ba55 [Python] support BucketByteBuffer and BucketUUID (#2836)
a54ba55 is described below
commit a54ba55095c52c979ee245c45c860e2855b44b28
Author: jun-he <[email protected]>
AuthorDate: Wed Jul 28 12:34:22 2021 -0700
[Python] support BucketByteBuffer and BucketUUID (#2836)
* [Python] support BucketByteBuffer and BucketUUID
* Add additional unit tests for bucket hash methods.
---
python/iceberg/api/transforms/bucket.py | 21 ++++++++++---
python/tests/api/test_partition_spec.py | 16 +++++-----
python/tests/api/transforms/test_bucket.py | 49 ++++++++++++++++++++++++++++++
3 files changed, 72 insertions(+), 14 deletions(-)
diff --git a/python/iceberg/api/transforms/bucket.py b/python/iceberg/api/transforms/bucket.py
index 64db712..2884b81 100644
--- a/python/iceberg/api/transforms/bucket.py
+++ b/python/iceberg/api/transforms/bucket.py
@@ -28,6 +28,7 @@ from ..expressions import (Expressions,
Operation)
from ..types.types import (IntegerType,
TypeID)
+from ...api.types.conversions import Conversions
class Bucket(Transform):
@@ -69,7 +70,7 @@ class Bucket(Transform):
return Bucket.__class__, self.n
def __repr__(self):
- return "Bucket(n)" % self.n
+ return "Bucket[%s]" % self.n
def __str__(self):
return "bucket[%s]" % self.n
@@ -167,14 +168,24 @@ class BucketString(Bucket):
class BucketByteBuffer(Bucket):
def __init__(self, n):
- # super(BucketByteBuffer, self).__init__(n)
- raise NotImplementedError()
+ super(BucketByteBuffer, self).__init__(n)
+
+ def hash(self, value):
+ return Bucket.MURMUR3.hash(value)
+
+ def can_transform(self, type_var):
+ return type_var.type_id in [TypeID.BINARY, TypeID.FIXED]
class BucketUUID(Bucket):
def __init__(self, n):
- # super(BucketUUID, self).__init__(n)
- raise NotImplementedError()
+ super(BucketUUID, self).__init__(n)
+
+ def hash(self, value):
+ return Bucket.MURMUR3.hash(Conversions.to_byte_buffer(TypeID.UUID, value))
+
+ def can_transform(self, type_var):
+ return type_var.type_id == TypeID.UUID
def to_bytes(n, length, byteorder='big'):
diff --git a/python/tests/api/test_partition_spec.py b/python/tests/api/test_partition_spec.py
index 93cee14..70ab574 100644
--- a/python/tests/api/test_partition_spec.py
+++ b/python/tests/api/test_partition_spec.py
@@ -30,9 +30,9 @@ from iceberg.api.types import (BinaryType,
from tests.api.test_helpers import TestHelpers
-class TestConversions(unittest.TestCase):
+class TestPartitionSpec(unittest.TestCase):
- def test_transforms(self):
+ def test_partition_spec(self):
schema = Schema(NestedField.required(1, "i", IntegerType.get()),
NestedField.required(2, "l", LongType.get()),
NestedField.required(3, "d", DateType.get()),
@@ -60,10 +60,9 @@ class TestConversions(unittest.TestCase):
PartitionSpec.builder_for(schema).bucket("ts", 128).build(),
PartitionSpec.builder_for(schema).bucket("dec", 128).build(),
PartitionSpec.builder_for(schema).bucket("s", 128).build(),
- # todo support them
- # PartitionSpec.builder_for(schema).bucket("u", 128).build(),
- # PartitionSpec.builder_for(schema).bucket("f", 128).build(),
- # PartitionSpec.builder_for(schema).bucket("b", 128).build(),
+ PartitionSpec.builder_for(schema).bucket("u", 128).build(),
+ PartitionSpec.builder_for(schema).bucket("f", 128).build(),
+ PartitionSpec.builder_for(schema).bucket("b", 128).build(),
PartitionSpec.builder_for(schema).year("d").build(),
PartitionSpec.builder_for(schema).month("d").build(),
PartitionSpec.builder_for(schema).day("d").build(),
@@ -75,9 +74,8 @@ class TestConversions(unittest.TestCase):
PartitionSpec.builder_for(schema).truncate("l", 10).build(),
PartitionSpec.builder_for(schema).truncate("dec", 10).build(),
PartitionSpec.builder_for(schema).truncate("s", 10).build(),
- # todo support them
- # PartitionSpec.builder_for(schema).add_without_field_id(6, "dec_unsupported", "unsupported").build(),
- # PartitionSpec.builder_for(schema).add(6, 1111, "dec_unsupported", "unsupported").build(),
+ PartitionSpec.builder_for(schema).add_without_field_id(6, "dec_unsupported", "unsupported").build(),
+ PartitionSpec.builder_for(schema).add(6, 1111, "dec_unsupported", "unsupported").build(),
]
for spec in specs:
diff --git a/python/tests/api/transforms/test_bucket.py b/python/tests/api/transforms/test_bucket.py
new file mode 100644
index 0000000..afda054
--- /dev/null
+++ b/python/tests/api/transforms/test_bucket.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import decimal
+import unittest
+import uuid
+
+from iceberg.api.transforms import Transforms
+from iceberg.api.types import (BinaryType,
+ DateType,
+ DecimalType,
+ FixedType,
+ IntegerType,
+ LongType,
+ StringType,
+ TimestampType,
+ TimeType,
+ UUIDType)
+
+
+class TestBucket(unittest.TestCase):
+
+ def test_bucket_hash(self):
+ buckets = [
+ [Transforms.bucket(IntegerType.get(), 100), 34, 2017239379],
+ [Transforms.bucket(LongType.get(), 100), 34, 2017239379],
+ [Transforms.bucket(DateType.get(), 100), 17486, -653330422],
+ [Transforms.bucket(TimeType.get(), 100), 81068000000, -662762989],
+ [Transforms.bucket(TimestampType.without_timezone(), 100), 1510871468000000, -2047944441],
+ [Transforms.bucket(DecimalType.of(9, 2), 100), decimal.Decimal("14.20"), -500754589],
+ [Transforms.bucket(StringType.get(), 100), "iceberg", 1210000089],
+ [Transforms.bucket(UUIDType.get(), 100), uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 1488055340],
+ [Transforms.bucket(FixedType.of_length(3), 128), b'foo', -156908512],
+ [Transforms.bucket(BinaryType.get(), 128), b'\x00\x01\x02\x03', -188683207]
+ ]
+
+ for bucket in buckets:
+ self.assertEqual(bucket[2], bucket[0].hash(bucket[1]))