wgtmac commented on code in PR #1020:
URL: https://github.com/apache/parquet-mr/pull/1020#discussion_r1070591970
##########
parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java:
##########
@@ -398,18 +398,21 @@ public long hash(Binary value) {
   @Override
   public void merge(BloomFilter otherBloomFilter) throws IOException {
-    Preconditions.checkArgument((otherBloomFilter.getAlgorithm() == getAlgorithm()),
-      "BloomFilter algorithm should be same");
-    Preconditions.checkArgument((otherBloomFilter.getHashStrategy() == getHashStrategy()),
-      "BloomFilter hashStrategy should be same");
-    Preconditions.checkArgument((otherBloomFilter.getBitsetSize() == getBitsetSize()),
-      "BloomFilter bitset size should be same");
+    Preconditions.checkArgument(otherBloomFilter != null, "Cannot merge a null BloomFilter");
+    Preconditions.checkArgument((getAlgorithm() == otherBloomFilter.getAlgorithm()),
Review Comment:
nit: you don't need to call String.format explicitly; Preconditions.checkArgument accepts a message template and arguments directly:
```
Preconditions.checkArgument((getAlgorithm() == otherBloomFilter.getAlgorithm()),
  "BloomFilters must have the same algorithm (%s != %s)",
  getAlgorithm(), otherBloomFilter.getAlgorithm());
```
##########
parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java:
##########
@@ -394,4 +395,24 @@ public long hash(float value) {
   public long hash(Binary value) {
     return hashFunction.hashBytes(value.getBytes());
   }
+
+  @Override
+  public void merge(BloomFilter otherBloomFilter) throws IOException {
+    Preconditions.checkArgument(otherBloomFilter != null, "Cannot merge a null BloomFilter");
+    Preconditions.checkArgument((getAlgorithm() == otherBloomFilter.getAlgorithm()),
+      String.format("BloomFilters must have the same algorithm (%s != %s)",
+        getAlgorithm(), otherBloomFilter.getAlgorithm()));
+    Preconditions.checkArgument((getHashStrategy() == otherBloomFilter.getHashStrategy()),
+      String.format("BloomFilters must have the same hashStrategy (%s != %s)",
+        getHashStrategy(), otherBloomFilter.getHashStrategy()));
+    Preconditions.checkArgument((getBitsetSize() == otherBloomFilter.getBitsetSize()),
+      String.format("BloomFilters must have the same size of bitsets (%s != %s)",
+        getBitsetSize(), otherBloomFilter.getBitsetSize()));
+    ByteArrayOutputStream otherOutputStream = new ByteArrayOutputStream();
+    otherBloomFilter.writeTo(otherOutputStream);
+    byte[] otherBits = otherOutputStream.toByteArray();
Review Comment:
should we check the equality of bitset.length and otherBits.length?
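For example, a check along these lines could guard against a length mismatch (sketch only; it assumes the internal byte[] field is named `bitset` and that `writeTo` emits just the raw bitset bytes, so `otherBits.length` is the other filter's bitset length):
```
// Sketch: reject merging filters whose serialized bitsets differ in length.
Preconditions.checkArgument(bitset.length == otherBits.length,
  "BloomFilters must have the same bitset length (%s != %s)",
  bitset.length, otherBits.length);
```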
##########
parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java:
##########
@@ -181,6 +182,60 @@ public void testBloomFilterNDVs(){
     assertTrue(bytes < 5 * 1024 * 1024);
   }
+  @Test
+  public void testMergeBloomFilter() throws IOException {
+    Random random = new Random();
+    int numBytes = BlockSplitBloomFilter.optimalNumOfBits(1024 * 1024, 0.01) / 8;
+    BloomFilter otherBloomFilter = new BlockSplitBloomFilter(numBytes);
+    BloomFilter mergedBloomFilter = new BlockSplitBloomFilter(numBytes);
+
+    Set<String> originStrings = new HashSet<>();
+    Set<String> testStrings = new HashSet<>();
+    Set<Integer> testInts = new HashSet<>();
+    Set<Double> testDoubles = new HashSet<>();
+    Set<Float> testFloats = new HashSet<>();
+    for (int i = 0; i < 1024; i++) {
+
+      String originStrValue = RandomStringUtils.randomAlphabetic(1, 64);
+      originStrings.add(originStrValue);
+      mergedBloomFilter.insertHash(otherBloomFilter.hash(Binary.fromString(originStrValue)));
+
+      String testString = RandomStringUtils.randomAlphabetic(1, 64);
+      testStrings.add(testString);
+      otherBloomFilter.insertHash(otherBloomFilter.hash(Binary.fromString(testString)));
+
+      int testInt = random.nextInt();
+      testInts.add(testInt);
+      otherBloomFilter.insertHash(otherBloomFilter.hash(testInt));
+
+      double testDouble = random.nextDouble();
+      testDoubles.add(testDouble);
+      otherBloomFilter.insertHash(otherBloomFilter.hash(testDouble));
+
+      float testFloat = random.nextFloat();
+      testFloats.add(testFloat);
+      otherBloomFilter.insertHash(otherBloomFilter.hash(testFloat));
+    }
+    mergedBloomFilter.merge(otherBloomFilter);
+    for (String testString : originStrings) {
+      assertTrue(mergedBloomFilter.findHash(mergedBloomFilter.hash(Binary.fromString(testString))));
Review Comment:
In this case, I'd suggest not using random values.
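For example (sketch only, keeping the existing variable names), deterministic inputs would make every run insert and check exactly the same keys:
```
// Sketch: fixed values instead of RandomStringUtils / Random.
String originStrValue = "origin-" + i;
originStrings.add(originStrValue);
mergedBloomFilter.insertHash(mergedBloomFilter.hash(Binary.fromString(originStrValue)));

String testString = "other-" + i;
testStrings.add(testString);
otherBloomFilter.insertHash(otherBloomFilter.hash(Binary.fromString(testString)));
```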
##########
parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java:
##########
@@ -181,6 +182,60 @@ public void testBloomFilterNDVs(){
     assertTrue(bytes < 5 * 1024 * 1024);
   }
+  @Test
+  public void testMergeBloomFilter() throws IOException {
+    Random random = new Random();
+    int numBytes = BlockSplitBloomFilter.optimalNumOfBits(1024 * 1024, 0.01) / 8;
+    BloomFilter otherBloomFilter = new BlockSplitBloomFilter(numBytes);
+    BloomFilter mergedBloomFilter = new BlockSplitBloomFilter(numBytes);
+
+    Set<String> originStrings = new HashSet<>();
+    Set<String> testStrings = new HashSet<>();
+    Set<Integer> testInts = new HashSet<>();
+    Set<Double> testDoubles = new HashSet<>();
+    Set<Float> testFloats = new HashSet<>();
+    for (int i = 0; i < 1024; i++) {
+
+      String originStrValue = RandomStringUtils.randomAlphabetic(1, 64);
+      originStrings.add(originStrValue);
+      mergedBloomFilter.insertHash(otherBloomFilter.hash(Binary.fromString(originStrValue)));
+
+      String testString = RandomStringUtils.randomAlphabetic(1, 64);
+      testStrings.add(testString);
+      otherBloomFilter.insertHash(otherBloomFilter.hash(Binary.fromString(testString)));
+
+      int testInt = random.nextInt();
+      testInts.add(testInt);
+      otherBloomFilter.insertHash(otherBloomFilter.hash(testInt));
+
+      double testDouble = random.nextDouble();
+      testDoubles.add(testDouble);
+      otherBloomFilter.insertHash(otherBloomFilter.hash(testDouble));
+
+      float testFloat = random.nextFloat();
+      testFloats.add(testFloat);
+      otherBloomFilter.insertHash(otherBloomFilter.hash(testFloat));
+    }
+    mergedBloomFilter.merge(otherBloomFilter);
+    for (String testString : originStrings) {
+      assertTrue(mergedBloomFilter.findHash(mergedBloomFilter.hash(Binary.fromString(testString))));
Review Comment:
Actually it does not matter what the data type is. We could simplify the test by inserting two lists of hash values to create the two BFs; then we know exactly what to expect for each value when probing the merged filter. What do you think?
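Something along these lines (just a sketch of the idea; the method name, hash values, and sizes are illustrative):
```
@Test
public void testMergeBloomFilterWithHashes() throws IOException {
  int numBytes = BlockSplitBloomFilter.optimalNumOfBits(1024 * 1024, 0.01) / 8;
  BloomFilter mergedBloomFilter = new BlockSplitBloomFilter(numBytes);
  BloomFilter otherBloomFilter = new BlockSplitBloomFilter(numBytes);

  // Two fixed, disjoint lists of hash values, one per filter.
  long[] mergedHashes = {1L, 2L, 3L, 4L, 5L};
  long[] otherHashes = {100L, 200L, 300L, 400L, 500L};
  for (long hash : mergedHashes) {
    mergedBloomFilter.insertHash(hash);
  }
  for (long hash : otherHashes) {
    otherBloomFilter.insertHash(hash);
  }

  mergedBloomFilter.merge(otherBloomFilter);

  // After the merge, every hash from both lists must be found (no false negatives).
  for (long hash : mergedHashes) {
    assertTrue(mergedBloomFilter.findHash(hash));
  }
  for (long hash : otherHashes) {
    assertTrue(mergedBloomFilter.findHash(hash));
  }
}
```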
##########
parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java:
##########
@@ -181,6 +182,83 @@ public void testBloomFilterNDVs(){
    assertTrue(bytes < 5 * 1024 * 1024);
  }
+  @Test
+  public void testMergeEmptyBloomFilter() throws IOException {
Review Comment:
We probably also need a test that expects an exception to be thrown when the two BFs are not compatible.
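For example (rough sketch; the method name and sizes are arbitrary, and it assumes the incompatibility surfaces as an IllegalArgumentException from the precondition checks):
```
// Sketch: merging filters with different bitset sizes should be rejected.
@Test(expected = IllegalArgumentException.class)
public void testMergeIncompatibleBloomFilter() throws IOException {
  BloomFilter smallBloomFilter = new BlockSplitBloomFilter(1024);
  BloomFilter largeBloomFilter = new BlockSplitBloomFilter(4 * 1024);
  smallBloomFilter.merge(largeBloomFilter);
}
```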
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]