wgtmac commented on code in PR #35731:
URL: https://github.com/apache/arrow/pull/35731#discussion_r1203469606
##########
cpp/src/parquet/bloom_filter.h:
##########
@@ -49,6 +49,11 @@ class PARQUET_EXPORT BloomFilter {
/// @param hash the hash of value to insert into Bloom filter.
virtual void InsertHash(uint64_t hash) = 0;
+ /// Insert element to set represented by Bloom filter bitset.
Review Comment:
```suggestion
/// Insert elements to set represented by Bloom filter bitset.
```
##########
cpp/src/parquet/bloom_filter_test.cc:
##########
@@ -326,11 +326,31 @@ TEST(XxHashTest, TestBloomFilter) {
uint8_t bytes[32] = {};
for (int i = 0; i < 32; i++) {
- ByteArray byteArray(i, bytes);
+ ByteArray byte_array(i, bytes);
bytes[i] = i;
auto hasher_seed_0 = std::make_unique<XxHasher>();
- EXPECT_EQ(HASHES_OF_LOOPING_BYTES_WITH_SEED_0[i],
hasher_seed_0->Hash(&byteArray))
+ EXPECT_EQ(HASHES_OF_LOOPING_BYTES_WITH_SEED_0[i],
hasher_seed_0->Hash(&byte_array))
+ << "Hash with seed 0 Error: " << i;
+ }
+}
+
+// Same as TestBloomFilter but using Batch interface
Review Comment:
Would be good to cover different types, though it is missing in the original
test...
##########
cpp/src/parquet/hasher.h:
##########
@@ -66,6 +66,58 @@ class Hasher {
/// @param len the value length.
virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
+ /// Batch compute hashes for 32 bits values by using its plain encoding
result.
+ ///
+ /// @param values the value to hash.
+ /// @param num_values the number of values to hash.
+ /// @param hashes the output hash value, it length should be equal to
num_values.
+ virtual void Hashes(const int32_t* values, int num_values, uint64_t* hashes)
const = 0;
Review Comment:
Is `BatchHash` a better name?
##########
cpp/src/parquet/bloom_filter.h:
##########
@@ -49,6 +49,11 @@ class PARQUET_EXPORT BloomFilter {
/// @param hash the hash of value to insert into Bloom filter.
virtual void InsertHash(uint64_t hash) = 0;
+ /// Insert element to set represented by Bloom filter bitset.
+ /// @param hashes the hash of value to insert into Bloom filter.
Review Comment:
```suggestion
/// @param hashes the hash values to insert into Bloom filter.
```
##########
cpp/src/parquet/bloom_filter.h:
##########
@@ -200,6 +257,11 @@ class PARQUET_EXPORT BlockSplitBloomFilter : public
BloomFilter {
bool FindHash(uint64_t hash) const override;
void InsertHash(uint64_t hash) override;
+ void InsertHashes(const uint64_t* hashes, int num_values) override {
+ for (int i = 0; i < num_values; ++i) {
+ InsertHash(hashes[i]);
Review Comment:
Implement a `InsertHashInternal` to avoid multiple virtual calls here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]