rdblue commented on code in PR #5124:
URL: https://github.com/apache/iceberg/pull/5124#discussion_r918348241
##########
python/pyiceberg/transforms.py:
##########
@@ -130,99 +161,58 @@ def apply(self, value: Optional[S]) -> Optional[int]:
def result_type(self, source: IcebergType) -> IcebergType:
return IntegerType()
- @abstractmethod
- def can_transform(self, source: IcebergType) -> bool:
- pass
-
- def __repr__(self) -> str:
- return f"transforms.bucket(source_type={repr(self._source_type)},
num_buckets={self._num_buckets})"
-
-
-class BucketNumberTransform(BaseBucketTransform):
- """Transforms a value of IntegerType, LongType, DateType, TimeType,
TimestampType, or TimestamptzType
- into a bucket partition value
-
- Example:
- >>> transform = BucketNumberTransform(LongType(), 100)
- >>> transform.apply(81068000000)
- 59
- """
-
def can_transform(self, source: IcebergType) -> bool:
- return type(source) in {IntegerType, DateType, LongType, TimeType,
TimestampType, TimestamptzType}
-
- def hash(self, value) -> int:
- return mmh3.hash(struct.pack("<q", value))
-
-
-class BucketDecimalTransform(BaseBucketTransform):
- """Transforms a value of DecimalType into a bucket partition value.
-
- Example:
- >>> transform = BucketDecimalTransform(DecimalType(9, 2), 100)
- >>> transform.apply(Decimal("14.20"))
- 59
- """
-
- def can_transform(self, source: IcebergType) -> bool:
- return isinstance(source, DecimalType)
-
- def hash(self, value: Decimal) -> int:
- return mmh3.hash(decimal_to_bytes(value))
-
-
-class BucketStringTransform(BaseBucketTransform):
- """Transforms a value of StringType into a bucket partition value.
-
- Example:
- >>> transform = BucketStringTransform(StringType(), 100)
- >>> transform.apply("iceberg")
- 89
- """
-
- def can_transform(self, source: IcebergType) -> bool:
- return isinstance(source, StringType)
-
- def hash(self, value: str) -> int:
- return mmh3.hash(value)
-
-
-class BucketBytesTransform(BaseBucketTransform):
- """Transforms a value of FixedType or BinaryType into a bucket partition
value.
-
- Example:
- >>> transform = BucketBytesTransform(BinaryType(), 100)
- >>> transform.apply(b"\\x00\\x01\\x02\\x03")
- 41
- """
-
- def can_transform(self, source: IcebergType) -> bool:
- return type(source) in {FixedType, BinaryType}
-
- def hash(self, value: bytes) -> int:
- return mmh3.hash(value)
+ return type(source) in {
+ IntegerType,
+ DateType,
+ LongType,
+ TimeType,
+ TimestampType,
+ TimestamptzType,
+ DecimalType,
+ StringType,
+ FixedType,
+ BinaryType,
+ UUIDType,
+ }
+
+ def hash_function(self, source: IcebergType, bucket: bool = True) ->
Callable[[Optional[Any]], Optional[int]]:
+ source_type = type(source)
+ if source_type in {IntegerType, LongType, DateType, TimeType,
TimestampType, TimestamptzType}:
+
+ def hash_func(v):
+ return mmh3.hash(struct.pack("<q", v))
+
+ elif source_type == DecimalType:
+
+ def hash_func(v):
+ return mmh3.hash(decimal_to_bytes(v))
+
+ elif source_type in {StringType, FixedType, BinaryType}:
+
+ def hash_func(v):
+ return mmh3.hash(v)
+
+ elif source_type == UUIDType:
+
+ def hash_func(v):
Review Comment:
Is there a performance penalty for defining nested functions like this, since
they are re-created each time `hash_function` is called? We could also make
these module-level functions, like `def _hash_uuid(uuid: UUID) -> int: ...`,
and return references to them here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]