tvalentyn commented on code in PR #31249:
URL: https://github.com/apache/beam/pull/31249#discussion_r1597309758
##########
sdks/python/apache_beam/ml/transforms/tft.py:
##########
@@ -637,3 +637,48 @@ def apply_transform(self, data: tf.SparseTensor,
output_col_name: str):
def count_unique_words(
data: tf.SparseTensor, output_vocab_name: Optional[str]) -> None:
tft.count_per_key(data, key_vocabulary_filename=output_vocab_name)
+
+
+@register_input_dtype(str)
+class HashStrings(TFTOperation):
+ def __init__(
+ self,
+ columns: List[str],
+ hash_buckets: int,
+ key: Optional[Iterable[int]] = None,
+ name: Optional[str] = None):
+ '''Hashes strings into the provided number of buckets.
+
+ Args:
+ columns: A list of the column names to apply the transformation on.
+ hash_buckets: the number of buckets to hash the strings into.
+ key: optional. An array of two Python `uint64`. If passed, output will be
Review Comment:
Should this be a `Tuple[int, int]`, since the docstring describes `key` as an array of exactly two `uint64` values?
##########
sdks/python/apache_beam/ml/transforms/tft.py:
##########
@@ -637,3 +637,48 @@ def apply_transform(self, data: tf.SparseTensor,
output_col_name: str):
def count_unique_words(
data: tf.SparseTensor, output_vocab_name: Optional[str]) -> None:
tft.count_per_key(data, key_vocabulary_filename=output_vocab_name)
+
+
+@register_input_dtype(str)
+class HashStrings(TFTOperation):
+ def __init__(
+ self,
+ columns: List[str],
+ hash_buckets: int,
+ key: Optional[Iterable[int]] = None,
+ name: Optional[str] = None):
+ '''Hashes strings into the provided number of buckets.
+
+ Args:
+ columns: A list of the column names to apply the transformation on.
+ hash_buckets: the number of buckets to hash the strings into.
+ key: optional. An array of two Python `uint64`. If passed, output will be
+ a deterministic function of `strings` and `key`. Note that hashing will
+ be slower if this value is specified.
+ name: optional. A name for this operation.
+
+ Raises:
+ ValueError if `hash_buckets` is not a positive and non-zero integer.
+ '''
+ self.hash_buckets = hash_buckets
+ self.key = key
+ self.name = name
+
+ if hash_buckets < 1:
+ raise ValueError(
+ 'number of hash buckets must be positive and non-zero, got ',
Review Comment:
```suggestion
'number of hash buckets must be positive, got ',
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]