This is an automated email from the ASF dual-hosted git repository.
yihua pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 29a59d2318a [HUDI-6850] Add tests and docs for ported Bloom Filter
classes (#9700)
29a59d2318a is described below
commit 29a59d2318af5c86d7e7eb45655a3ac06d3d278e
Author: Y Ethan Guo <[email protected]>
AuthorDate: Wed Sep 13 10:22:57 2023 -0700
[HUDI-6850] Add tests and docs for ported Bloom Filter classes (#9700)
---
LICENSE | 15 ++++-
.../org/apache/hudi/common/bloom/HashFunction.java | 35 ++++++++++-
.../hudi/common/bloom/InternalBloomFilter.java | 3 +
.../apache/hudi/common/bloom/InternalFilter.java | 30 +++++++++-
.../java/org/apache/hudi/common/bloom/Key.java | 4 +-
.../org/apache/hudi/common/util/hash/Hash.java | 2 +
.../apache/hudi/common/util/hash/JenkinsHash.java | 4 +-
.../apache/hudi/common/util/hash/MurmurHash.java | 4 +-
.../apache/hudi/common/bloom/TestBloomFilter.java | 70 ++++++++++++++++++++++
.../hudi/common/table/log/TestLogReaderUtils.java | 11 +---
.../hudi/common/testutils/FileSystemTestUtils.java | 10 ++++
.../hudi/common/testutils/HoodieTestUtils.java | 4 +-
.../format/bloom-filter/hadoop/all_10000.keys.data | 19 ++++++
.../dynamic_1000_000001_jenkins_5000.bf.data | 19 ++++++
.../hadoop/dynamic_1000_000001_murmur_5000.bf.data | 19 ++++++
.../hadoop/dynamic_200_000001_murmur_1000.bf.data | 19 ++++++
.../hadoop/simple_10000_000001_murmur.bf.data | 19 ++++++
.../hadoop/simple_1000_000001_murmur.bf.data | 19 ++++++
.../hadoop/simple_200_000001_murmur.bf.data | 19 ++++++
.../hadoop/simple_5000_000001_jenkins.bf.data | 19 ++++++
.../hadoop/simple_5000_000001_murmur.bf.data | 19 ++++++
21 files changed, 345 insertions(+), 18 deletions(-)
diff --git a/LICENSE b/LICENSE
index 28222a717e6..301ea869628 100644
--- a/LICENSE
+++ b/LICENSE
@@ -291,7 +291,20 @@ This product includes code from Apache Hadoop
* org.apache.hudi.common.bloom.InternalDynamicBloomFilter.java adapted from
org.apache.hadoop.util.bloom.DynamicBloomFilter.java
-* org.apache.hudi.common.bloom.InternalFilter copied from classes in
org.apache.hadoop.util.bloom package
+* org.apache.hudi.common.bloom.InternalFilter.java adapted from
org.apache.hadoop.util.bloom.Filter.java
+ and org.apache.hadoop.io.Writable.java
+
+* org.apache.hudi.common.bloom.InternalBloomFilter adapted from
org.apache.hadoop.util.bloom.BloomFilter.java
+
+* org.apache.hudi.common.bloom.Key.java adapted from
org.apache.hadoop.util.bloom.Key.java
+
+* org.apache.hudi.common.bloom.HashFunction.java ported from
org.apache.hadoop.util.bloom.HashFunction.java
+
+* org.apache.hudi.common.util.hash.Hash.java ported from
org.apache.hadoop.util.hash.Hash.java
+
+* org.apache.hudi.common.util.hash.JenkinsHash.java ported from
org.apache.hadoop.util.hash.JenkinsHash.java
+
+* org.apache.hudi.common.util.hash.MurmurHash.java ported from
org.apache.hadoop.util.hash.MurmurHash.java
with the following license
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/bloom/HashFunction.java
b/hudi-common/src/main/java/org/apache/hudi/common/bloom/HashFunction.java
index e2637b10d6d..c6e6deb8727 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/bloom/HashFunction.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/bloom/HashFunction.java
@@ -16,6 +16,37 @@
* specific language governing permissions and limitations
* under the License.
*/
+/**
+ * Copyright (c) 2005, European Commission project OneLab under contract 034819
+ * (http://www.one-lab.org)
+ * <p>
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ * - Neither the name of the University Catholique de Louvain - UCL
+ * nor the names of its contributors may be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ * <p>
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
package org.apache.hudi.common.bloom;
@@ -23,11 +54,13 @@ import org.apache.hudi.common.util.hash.Hash;
/**
* Implements a hash object that returns a certain number of hashed values.
+ * <p>
+ * The code in this class is ported from {@link
org.apache.hadoop.util.bloom.HashFunction} in Apache Hadoop.
*
* @see Key The general behavior of a key being stored in a bloom filter
* @see InternalBloomFilter The general behavior of a bloom filter
*/
-public class HashFunction {
+public final class HashFunction {
/**
* The number of hashed values.
*/
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalBloomFilter.java
b/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalBloomFilter.java
index 4e2c56d163f..ac93de2d58f 100644
---
a/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalBloomFilter.java
+++
b/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalBloomFilter.java
@@ -57,6 +57,9 @@ import java.util.BitSet;
* Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
* <p>
* The code in class is adapted from {@link
org.apache.hadoop.util.bloom.BloomFilter} in Apache Hadoop.
+ * The serialization and deserialization are completely the same as and
compatible with Hadoop's
+ * {@link org.apache.hadoop.util.bloom.BloomFilter}, so that this class
correctly reads bloom
+ * filters serialized by older Hudi versions using Hadoop's BloomFilter.
* <p>
* Hudi serializes bloom filter(s) and write them to Parquet file footers and
metadata table's
* bloom filter partition containing bloom filters for all data files. We
want to maintain the
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalFilter.java
b/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalFilter.java
index 87854edd313..6b2e46ee077 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalFilter.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/bloom/InternalFilter.java
@@ -27,7 +27,20 @@ import java.util.Collection;
import java.util.List;
/**
- * Ported from {@link org.apache.hadoop.util.bloom.Filter}.
+ * Defines the general behavior of a filter.
+ * <p>
+ * The code in this class is adapted from {@link
org.apache.hadoop.util.bloom.Filter} in Apache Hadoop.
+ * <p>
+ * A filter is a data structure which aims at offering a lossy summary of a
set <code>A</code>. The
+ * key idea is to map entries of <code>A</code> (also called <i>keys</i>) into
several positions
+ * in a vector through the use of several hash functions.
+ * <p>
+ * Typically, a filter will be implemented as a Bloom filter (or a Bloom
filter extension).
+ * <p>
+ * It must be extended in order to define the real behavior.
+ *
+ * @see Key The general behavior of a key
+ * @see HashFunction A hash function
*/
abstract class InternalFilter {
private static final int VERSION = -1; // negative to accommodate for old
format
@@ -160,6 +173,12 @@ abstract class InternalFilter {
}
} //end add()
+ /**
+ * Serialize the fields of this object to <code>out</code>.
+ *
+ * @param out <code>DataOutput</code> to serialize this object into.
+ * @throws IOException
+ */
public void write(DataOutput out) throws IOException {
out.writeInt(VERSION);
out.writeInt(this.nbHash);
@@ -167,6 +186,15 @@ abstract class InternalFilter {
out.writeInt(this.vectorSize);
}
+ /**
+ * Deserialize the fields of this object from <code>in</code>.
+ *
+ * <p>For efficiency, implementations should attempt to re-use storage in the
+ * existing object where possible.</p>
+ *
+ * @param in <code>DataInput</code> to deserialize this object from.
+ * @throws IOException
+ */
public void readFields(DataInput in) throws IOException {
int ver = in.readInt();
if (ver > 0) { // old non-versioned format
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bloom/Key.java
b/hudi-common/src/main/java/org/apache/hudi/common/bloom/Key.java
index b762f14d063..37ae6e68f73 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/bloom/Key.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/bloom/Key.java
@@ -25,10 +25,12 @@ import java.io.IOException;
/**
* The general behavior of a key that must be stored in a bloom filter.
+ * <p>
+ * The code in this class is adapted from {@link org.apache.hadoop.util.bloom.Key}
in Apache Hadoop.
*
* @see InternalBloomFilter The general behavior of a bloom filter and how the
key is used.
*/
-public final class Key implements Comparable<Key> {
+public class Key implements Comparable<Key> {
/**
* Byte value of key
*/
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/Hash.java
b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/Hash.java
index 22218191674..a5e5d4a2f9a 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/Hash.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/Hash.java
@@ -24,6 +24,8 @@ import org.apache.hudi.common.bloom.InternalBloomFilter;
/**
* This class represents a common API for hashing functions used by
* {@link InternalBloomFilter}.
+ * <p>
+ * The code in this class is ported from {@link org.apache.hadoop.util.hash.Hash}
in Apache Hadoop.
*/
public abstract class Hash {
/**
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/JenkinsHash.java
b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/JenkinsHash.java
index 6b7a0e01d08..a254a78970f 100644
---
a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/JenkinsHash.java
+++
b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/JenkinsHash.java
@@ -24,7 +24,9 @@ import java.io.IOException;
/**
* Produces 32-bit hash for hash table lookup.
- *
+ * <p>
+ * The code in this class is ported from {@link
org.apache.hadoop.util.hash.JenkinsHash} in Apache Hadoop.
+ * <p>
* <pre>lookup3.c, by Bob Jenkins, May 2006, Public Domain.
*
* You can use this free for any purpose. It's in the public domain.
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/MurmurHash.java
b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/MurmurHash.java
index dd66da6dcdd..dcd074b881d 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/MurmurHash.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/MurmurHash.java
@@ -22,7 +22,9 @@ package org.apache.hudi.common.util.hash;
/**
* This is a very fast, non-cryptographic hash suitable for general hash-based
* lookup. See http://murmurhash.googlepages.com/ for more details.
- *
+ * <p>
+ * The code in this class is ported from {@link
org.apache.hadoop.util.hash.MurmurHash} in Apache Hadoop.
+ * <p>
* <p>The C version of MurmurHash 2.0 found at that site was ported
* to Java by Andrzej Bialecki (ab at getopt org).</p>
*/
diff --git
a/hudi-common/src/test/java/org/apache/hudi/common/bloom/TestBloomFilter.java
b/hudi-common/src/test/java/org/apache/hudi/common/bloom/TestBloomFilter.java
index 552098e71bb..2e72b3737a0 100644
---
a/hudi-common/src/test/java/org/apache/hudi/common/bloom/TestBloomFilter.java
+++
b/hudi-common/src/test/java/org/apache/hudi/common/bloom/TestBloomFilter.java
@@ -18,15 +18,21 @@
package org.apache.hudi.common.bloom;
+import org.apache.hudi.common.util.hash.Hash;
+
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.UUID;
+import java.util.stream.Collectors;
+import static
org.apache.hudi.common.testutils.FileSystemTestUtils.readLastLineFromResourceFile;
+import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
@@ -92,6 +98,51 @@ public class TestBloomFilter {
}
}
+ public static List<Arguments> bloomFilterParams() {
+ return Arrays.asList(
+ Arguments.of("hadoop", BloomFilterTypeCode.SIMPLE.name(), 200,
0.000001, Hash.MURMUR_HASH, -1),
+ Arguments.of("hadoop", BloomFilterTypeCode.SIMPLE.name(), 1000,
0.000001, Hash.MURMUR_HASH, -1),
+ Arguments.of("hadoop", BloomFilterTypeCode.SIMPLE.name(), 5000,
0.000001, Hash.MURMUR_HASH, -1),
+ Arguments.of("hadoop", BloomFilterTypeCode.SIMPLE.name(), 10000,
0.000001, Hash.MURMUR_HASH, -1),
+ Arguments.of("hadoop", BloomFilterTypeCode.SIMPLE.name(), 5000,
0.000001, Hash.JENKINS_HASH, -1),
+ Arguments.of("hadoop", BloomFilterTypeCode.DYNAMIC_V0.name(), 200,
0.000001, Hash.MURMUR_HASH, 1000),
+ Arguments.of("hadoop", BloomFilterTypeCode.DYNAMIC_V0.name(), 1000,
0.000001, Hash.MURMUR_HASH, 5000),
+ Arguments.of("hadoop", BloomFilterTypeCode.DYNAMIC_V0.name(), 1000,
0.000001, Hash.JENKINS_HASH, 5000),
+ Arguments.of("hudi", BloomFilterTypeCode.SIMPLE.name(), 1000,
0.000001, Hash.MURMUR_HASH, -1),
+ Arguments.of("hudi", BloomFilterTypeCode.SIMPLE.name(), 5000,
0.000001, Hash.MURMUR_HASH, -1),
+ Arguments.of("hudi", BloomFilterTypeCode.DYNAMIC_V0.name(), 1000,
0.000001, Hash.MURMUR_HASH, 5000)
+ );
+ }
+
+ @ParameterizedTest
+ @MethodSource("bloomFilterParams")
+ public void testDeserialize(String lib, String typeCode, int numEntries,
+ double errorRate, int hashType, int maxEntries)
throws IOException {
+ // When the "lib" = "hadoop", this tests the backwards compatibility so
that Hudi's
+ // {@link InternalBloomFilter} correctly reads the bloom filters
serialized by Hadoop
+ List<String> keyList = Arrays.stream(
+
readLastLineFromResourceFile("/format/bloom-filter/hadoop/all_10000.keys.data").split(","))
+ .collect(Collectors.toList());
+ String serializedFilter;
+ if ("hadoop".equals(lib)) {
+ String fileName =
(BloomFilterTypeCode.DYNAMIC_V0.name().equals(typeCode) ? "dynamic" : "simple")
+ + "_" + numEntries
+ + "_000001_"
+ + (hashType == Hash.MURMUR_HASH ? "murmur" : "jenkins")
+ + (BloomFilterTypeCode.DYNAMIC_V0.name().equals(typeCode) ? "_" +
maxEntries : "")
+ + ".bf.data";
+ serializedFilter =
readLastLineFromResourceFile("/format/bloom-filter/hadoop/" + fileName);
+ } else {
+ BloomFilter inputFilter = getBloomFilter(typeCode, numEntries,
errorRate, maxEntries);
+ for (String key : keyList) {
+ inputFilter.add(key);
+ }
+ serializedFilter = inputFilter.serializeToString();
+ }
+ validateBloomFilter(
+ serializedFilter, keyList, lib, typeCode, numEntries, errorRate,
hashType, maxEntries);
+ }
+
BloomFilter getBloomFilter(String typeCode, int numEntries, double
errorRate, int maxEntries) {
if (typeCode.equalsIgnoreCase(BloomFilterTypeCode.SIMPLE.name())) {
return BloomFilterFactory.createBloomFilter(numEntries, errorRate, -1,
typeCode);
@@ -99,4 +150,23 @@ public class TestBloomFilter {
return BloomFilterFactory.createBloomFilter(numEntries, errorRate,
maxEntries, typeCode);
}
}
+
+ private void validateBloomFilter(String serializedFilter, List<String>
keyList, String lib,
+ String typeCode, int numEntries, double
errorRate,
+ int hashType, int maxEntries) {
+ BloomFilter bloomFilter = BloomFilterFactory
+ .fromString(serializedFilter, typeCode);
+ for (String key : keyList) {
+ assertTrue(bloomFilter.mightContain(key), "Filter should have returned
true for " + key);
+ }
+ if ("hadoop".equals(lib) && hashType == Hash.MURMUR_HASH) {
+ BloomFilter hudiBloomFilter = getBloomFilter(typeCode, numEntries,
errorRate, maxEntries);
+ for (String key : keyList) {
+ hudiBloomFilter.add(key);
+ }
+ // The bloom filter serialized by the Hadoop library should be exactly the same as
the Hudi one,
+ // unless we customize Hudi's serialization in the future
+ assertEquals(hudiBloomFilter.serializeToString(), serializedFilter);
+ }
+ }
}
diff --git
a/hudi-common/src/test/java/org/apache/hudi/common/table/log/TestLogReaderUtils.java
b/hudi-common/src/test/java/org/apache/hudi/common/table/log/TestLogReaderUtils.java
index 69b1bddc5cf..fd8e3a5cd28 100644
---
a/hudi-common/src/test/java/org/apache/hudi/common/table/log/TestLogReaderUtils.java
+++
b/hudi-common/src/test/java/org/apache/hudi/common/table/log/TestLogReaderUtils.java
@@ -19,13 +19,10 @@
package org.apache.hudi.common.table.log;
-import org.apache.hudi.common.util.FileIOUtils;
-
import org.junit.jupiter.api.Test;
import org.roaringbitmap.longlong.Roaring64NavigableMap;
import java.io.IOException;
-import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
@@ -35,6 +32,7 @@ import java.util.Random;
import java.util.Set;
import java.util.stream.Collectors;
+import static
org.apache.hudi.common.testutils.FileSystemTestUtils.readLastLineFromResourceFile;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
@@ -92,11 +90,4 @@ public class TestLogReaderUtils {
assertFalse(expectedIterator.hasNext());
assertFalse(iterator.hasNext());
}
-
- private String readLastLineFromResourceFile(String resourceName) throws
IOException {
- try (InputStream inputStream =
TestLogReaderUtils.class.getResourceAsStream(resourceName)) {
- List<String> lines = FileIOUtils.readAsUTFStringLines(inputStream);
- return lines.get(lines.size() - 1);
- }
- }
}
diff --git
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java
index 82de0f3317f..e73f2bb0440 100644
---
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java
+++
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java
@@ -21,6 +21,8 @@ package org.apache.hudi.common.testutils;
import org.apache.hudi.common.fs.inline.InLineFSUtils;
import org.apache.hudi.common.fs.inline.InLineFileSystem;
import org.apache.hudi.common.fs.inline.InMemoryFileSystem;
+import org.apache.hudi.common.table.log.TestLogReaderUtils;
+import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -30,6 +32,7 @@ import org.apache.hadoop.fs.RemoteIterator;
import java.io.File;
import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
@@ -86,4 +89,11 @@ public class FileSystemTestUtils {
}
return statuses;
}
+
+ public static String readLastLineFromResourceFile(String resourceName)
throws IOException {
+ try (InputStream inputStream =
TestLogReaderUtils.class.getResourceAsStream(resourceName)) {
+ List<String> lines = FileIOUtils.readAsUTFStringLines(inputStream);
+ return lines.get(lines.size() - 1);
+ }
+ }
}
diff --git
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java
index 9dcd2851b4a..38488752e78 100644
---
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java
+++
b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java
@@ -18,7 +18,6 @@
package org.apache.hudi.common.testutils;
-import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieFileFormat;
@@ -34,6 +33,8 @@ import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import com.esotericsoftware.kryo.serializers.JavaSerializer;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.junit.jupiter.api.Assumptions;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
@@ -44,7 +45,6 @@ import java.util.List;
import java.util.Objects;
import java.util.Properties;
import java.util.UUID;
-import org.junit.jupiter.api.Assumptions;
/**
* A utility class for testing.
diff --git
a/hudi-common/src/test/resources/format/bloom-filter/hadoop/all_10000.keys.data
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/all_10000.keys.data
new file mode 100644
index 00000000000..5d11b297de6
--- /dev/null
+++
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/all_10000.keys.data
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+90d77c70-a0ef-4cc4-a376-1904e9cf2b52,38db2a3b-7e9d-4774-998f-43d3389dd828,9171563b-e57e-438a-ba10-47197df85c77,07561753-19c0-4d02-8f59-4efebe2692a8,ccc09818-13bf-4024-af7e-c39b160539d3,de0088d1-33a4-4df1-86ef-b2fd8db2484b,7a1b5242-1c29-4c62-a2e9-452c22944a2f,bce526bf-471b-462b-b98c-138ec44a8f2d,7aa186da-4f51-49f8-bde0-e4b375968b1f,7ed111bd-5b3e-4381-8842-df54a4b7ef4b,7f05efb8-53c9-459e-b9b2-fd29a37b311f,d2250bf1-8a18-4f90-8a78-9f4b954054f7,b19d1011-dda9-4019-8073-b432a52b2d2a,3404e922-61
[...]
\ No newline at end of file
diff --git
a/hudi-common/src/test/resources/format/bloom-filter/hadoop/dynamic_1000_000001_jenkins_5000.bf.data
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/dynamic_1000_000001_jenkins_5000.bf.data
new file mode 100644
index 00000000000..da6493da86b
--- /dev/null
+++
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/dynamic_1000_000001_jenkins_5000.bf.data
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/////wAAABQAAABwVAAAA+gAABdwAAAABf////8AAAAUAAAAcFT3Pfz//PzVvtbeljh1nf9+f7b9qe3/2v/7/6rG/c9/+/8fu9/9a+eu//3d/3//c//u/9/mK1+Nb9OXr5q/IcU//vtfmd//+nW7n/5/3nj/r/v7rvs2m/W7f7fX/+//3fen1X3lOP3///9b6o+69/fLb/m19zf39ts7/9P+/m/lP3Pf/Z+/r/3m/2/7X5b97+t/w7nvX///r3b33//7Wet/13/muuX1/+zO/ffm1H398X+u9f8sWf/7m6X9vvc/eX9+/n52/9eVcv+3R9qXffvv//f+df7e199/0v//f/Zve+6957X8+///Vx5vv+/3v/9+hX/6/d/+/+939V/1y/223/863X////u/U///Z//dT77s//3OvWkO+SUPYvf3fv/9ff+/9X//f//Pf9ye7r+fdvj/8728/n96/3/nf/tr79/18ZHSv7r/2/fv
[...]
\ No newline at end of file
diff --git
a/hudi-common/src/test/resources/format/bloom-filter/hadoop/dynamic_1000_000001_murmur_5000.bf.data
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/dynamic_1000_000001_murmur_5000.bf.data
new file mode 100644
index 00000000000..fab404c60fe
--- /dev/null
+++
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/dynamic_1000_000001_murmur_5000.bf.data
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/////wAAABQBAABwVAAAA+gAABdwAAAABf////8AAAAUAQAAcFS3z8Tv92zLAWyv3s77z17x++eZ75z/v/7H+V/ffzbt5338dv/7T00nO+4W/9/ftvu93ttd/9v5/1v+e+93/nR/z2f337P//nV3v7eLy/+v/9rv/v/Wf9fr/ffvzG67/dyvmu7/jz0/5b/7+0Lx/3//v/Z0vd5+a+9//t0nkrd/+5v1vfub/ldfrtv/////f8N3v/P7b+3+u//fzXP/9/3fe3s/Wfdoz/z/3fvO+be8jv3tTK+3zv4Vxuj2vytP97+v/cu/+f8f9nL77/tnPu33z0//7/3ve+09x/9/+3XrEv6q/2///8+E2t/s1++Lf6/vz+7z7nXvfrc//nWef9u/v53d33P9t8r7/29t6/s6etns/t/d/dLnuWStf/ONxfPvmpfrc/3373P2/Ovs/fv7+z/7vf/z/uXXP/31df+9fv2/7/177/8mv273
[...]
\ No newline at end of file
diff --git
a/hudi-common/src/test/resources/format/bloom-filter/hadoop/dynamic_200_000001_murmur_1000.bf.data
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/dynamic_200_000001_murmur_1000.bf.data
new file mode 100644
index 00000000000..5b0558188aa
--- /dev/null
+++
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/dynamic_200_000001_murmur_1000.bf.data
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/////wAAABQBAAAWeAAAAMgAACPwAAAABf////8AAAAUAQAAFnj////////////////////////////////////////////////////////////////////////////////////7////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////f///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
[...]
\ No newline at end of file
diff --git
a/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_10000_000001_murmur.bf.data
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_10000_000001_murmur.bf.data
new file mode 100644
index 00000000000..9956cd03741
--- /dev/null
+++
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_10000_000001_murmur.bf.data
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/////wAAABQBAARjQCthHR1wTOuedFsiGkWvegC1jRUAPFXD7idP6jXmBHPHdBoCV5SBMSabyd5nDqPW5vC+HTgyP3HFpQl9+ZTYp4gndeA6U3R5k80nJvVVa1YonVmIpg1Syf8m7mCRfgKalIw/ZTJmXT3HzG5Ia/hDHBrkz6Q1QBv9ZSvb5LaodG5v2Ypc/1t3sVe1xrUkexp6yveV4J98uepS/+V1YBWcBUw+jx2TTYqjoG6SbZj68ozvD0+zxZReVSxoSbOYMxrwtzb/9XniD9a+M7QhHOcrnjJBru3h3jhviHqUirUzvxt9JSKczxXPK0rb4uy65YhjVQtXhgtZN8+4iN/8IQhmUB0ghIj2bSe1UGpuNiztjFoQ8huEzsBK+1FNzVZmdayRXS7uxS0HfWABJtVuHjY+ambF/UyXAuk9xT9fszGmilJxqSx/Qj+6VXCnNhLrRTNlkt8ZPGZdIwV/yHF6EgvMpbSVa/NM
[...]
\ No newline at end of file
diff --git
a/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_1000_000001_murmur.bf.data
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_1000_000001_murmur.bf.data
new file mode 100644
index 00000000000..7ec1c6ec749
--- /dev/null
+++
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_1000_000001_murmur.bf.data
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/////wAAABQBAABwVP///////////////////////////////////////////////////+///////////////////////////////////////////////////////////////////////////////////////////////////////////////+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
[...]
\ No newline at end of file
diff --git
a/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_200_000001_murmur.bf.data
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_200_000001_murmur.bf.data
new file mode 100644
index 00000000000..0e6bd376b6d
--- /dev/null
+++
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_200_000001_murmur.bf.data
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/////wAAABQBAAAWeP//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
[...]
\ No newline at end of file
diff --git
a/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_5000_000001_jenkins.bf.data
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_5000_000001_jenkins.bf.data
new file mode 100644
index 00000000000..9ab79a06af2
--- /dev/null
+++
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_5000_000001_jenkins.bf.data
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/////wAAABQAAAIxoNNJm+D7Xr/v/6/ff1d1+f5//6c+2fbVdvrv3p8/q/rftd+a9/3+9/ff9f/7//T/v3/ef/L3ybz/9t1/7///9qz/vvX7eXr18++//9178VZ93HzOvnZ/tHtb/9/fa/e2/a/2/v7/e+8/PYvP409vuPbf/f9v9874P9fP99/rm59Y39f26vxz/e/2347/x97bezz7b/O+Ss/74bf7w//scn/O+t///evz2v+/97J76l2/r77f7/n3//sl/ZBcH/r/2X/fe/KG9X7/0m29y9N/3/d/t+Y+iv9n8v//X/Xft/effOj/0//3fwsdf7/v/96Pz9nh33fqf//Kv/PN77vZo877/mrf7/7vv3t+95/f/+/79/3wvF5V/Pdvu/t3/Jv9/v//7VVtHk99/zdv9///93/d7+tZQvX/z/u95HPvH//1//T7f7fvqv7cvdft9k3nH//Nm8p3vvs//eZP6v/54+8Tb7D9
[...]
\ No newline at end of file
diff --git
a/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_5000_000001_murmur.bf.data
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_5000_000001_murmur.bf.data
new file mode 100644
index 00000000000..9a22fc3bdf1
--- /dev/null
+++
b/hudi-common/src/test/resources/format/bloom-filter/hadoop/simple_5000_000001_murmur.bf.data
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/////wAAABQBAAIxoO/v3Z983fu+9Pu+X+2v/wr/r/c2P/3b/z/f7/XufX/ffZva95+fcT6b+/5/X6vf7v+/PX92v3H1rft////59/m3fep/V355//2vN/d363b53/n9vh3S7/93/+uRf16//51/9TLnXT3H/27O7/57Hn/lz+y1yfv/5e/b5L/pfn5/27/e/1t3vdf15/e9+/p+z/e/9d99+er6//3/6Nefr/4+r53zz/rz4G+ebZr//s/vX1+3/7R+fe17z/O/O5/493///f/2j/6/+7UrfPc/nvtv7/3h3rxv+v7e6/3zv5t9p7qc353vr87f4u6758v/18vXrt9Zd++8+///56tv8X92vr7+/fe9UWt//nz9rt9e+nul79JO/1fd3d/396633T7u/f+v/e6D//dvH7Z/7v///873Lv+9/b9/+/Omml/9+a5/2r++df+/fjP7R79/1t/5/Hf/N03/y3t6d5v+/beX6/NM
[...]
\ No newline at end of file