[
https://issues.apache.org/jira/browse/PARQUET-2157?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17554539#comment-17554539
]
ASF GitHub Bot commented on PARQUET-2157:
-----------------------------------------
ggershinsky commented on code in PR #975:
URL: https://github.com/apache/parquet-mr/pull/975#discussion_r897894894
##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -282,6 +286,63 @@ public void testParquetFileWithBloomFilter() throws
IOException {
}
}
+ @Test
+ public void testParquetFileWithBloomFilterWithFpp() throws IOException {
+ int totalCount = 100000;
+ double[] testFpp = {0.005, 0.01, 0.05, 0.10, 0.15, 0.20, 0.25};
+
+ Set<String> distinctStrings = new HashSet<>();
+ while (distinctStrings.size() < totalCount) {
+ String str = RandomStringUtils.randomAlphabetic(12);
+ distinctStrings.add(str);
+ }
+
+ MessageType schema = Types.buildMessage().
+ required(BINARY).as(stringType()).named("name").named("msg");
+
+ Configuration conf = new Configuration();
+ GroupWriteSupport.setSchema(schema, conf);
+
+ GroupFactory factory = new SimpleGroupFactory(schema);
+ for (int i = 0; i < testFpp.length; i++) {
+ File file = temp.newFile();
+ file.delete();
+ Path path = new Path(file.getAbsolutePath());
+ try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+ .withPageRowCountLimit(10)
+ .withConf(conf)
+ .withDictionaryEncoding(false)
+ .withBloomFilterEnabled("name", true)
+ .withBloomFilterNDV("name", totalCount)
+ .withBloomFilterFPP("name", testFpp[i])
+ .build()) {
+ java.util.Iterator<String> iterator = distinctStrings.iterator();
+ while (iterator.hasNext()) {
+ writer.write(factory.newGroup().append("name", iterator.next()));
+ }
+ }
+ distinctStrings.clear();
+
+ try (ParquetFileReader reader =
ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+ BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
+ BloomFilter bloomFilter =
reader.getBloomFilterDataReader(blockMetaData)
+ .readBloomFilter(blockMetaData.getColumns().get(0));
+
+ // The exist counts the number of times FindHash returns true.
+ int exist = 0;
+ while (distinctStrings.size() < totalCount) {
+ String str = RandomStringUtils.randomAlphabetic(10);
Review Comment:
the original values are 12 chars long. Are we supposed to find a 10-char
string among them?
##########
parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java:
##########
@@ -471,6 +484,12 @@ public Builder withBloomFilterNDV(String columnPath, long
ndv) {
return this;
}
+ public Builder withBloomFilterFPP(String columnPath, double fpp) {
Review Comment:
what happens if this value is set, but the bloom filter is not enabled?
(general / per-column)
##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -282,6 +286,63 @@ public void testParquetFileWithBloomFilter() throws
IOException {
}
}
+ @Test
+ public void testParquetFileWithBloomFilterWithFpp() throws IOException {
+ int totalCount = 100000;
+ double[] testFpp = {0.005, 0.01, 0.05, 0.10, 0.15, 0.20, 0.25};
+
+ Set<String> distinctStrings = new HashSet<>();
+ while (distinctStrings.size() < totalCount) {
+ String str = RandomStringUtils.randomAlphabetic(12);
+ distinctStrings.add(str);
+ }
+
+ MessageType schema = Types.buildMessage().
+ required(BINARY).as(stringType()).named("name").named("msg");
+
+ Configuration conf = new Configuration();
+ GroupWriteSupport.setSchema(schema, conf);
+
+ GroupFactory factory = new SimpleGroupFactory(schema);
+ for (int i = 0; i < testFpp.length; i++) {
+ File file = temp.newFile();
+ file.delete();
+ Path path = new Path(file.getAbsolutePath());
+ try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+ .withPageRowCountLimit(10)
+ .withConf(conf)
+ .withDictionaryEncoding(false)
+ .withBloomFilterEnabled("name", true)
+ .withBloomFilterNDV("name", totalCount)
+ .withBloomFilterFPP("name", testFpp[i])
+ .build()) {
+ java.util.Iterator<String> iterator = distinctStrings.iterator();
+ while (iterator.hasNext()) {
+ writer.write(factory.newGroup().append("name", iterator.next()));
+ }
+ }
+ distinctStrings.clear();
+
+ try (ParquetFileReader reader =
ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+ BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
+ BloomFilter bloomFilter =
reader.getBloomFilterDataReader(blockMetaData)
+ .readBloomFilter(blockMetaData.getColumns().get(0));
+
+ // The exist counts the number of times FindHash returns true.
+ int exist = 0;
+ while (distinctStrings.size() < totalCount) {
+ String str = RandomStringUtils.randomAlphabetic(10);
+ if (distinctStrings.add(str) &&
+
bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(str).toByteBuffer())))
{
+ exist++;
+ }
+ }
+ // The exist should be less than totalCount * fpp. Add 10% here for
error space.
+ assertTrue(exist < totalCount * (testFpp[i] * 1.1));
Review Comment:
just curious whether `totalCount` is sufficient; how often is `exist` > 0?
> Add BloomFilter fpp config
> --------------------------
>
> Key: PARQUET-2157
> URL: https://issues.apache.org/jira/browse/PARQUET-2157
> Project: Parquet
> Issue Type: Improvement
> Components: parquet-mr
> Reporter: Huaxin Gao
> Priority: Major
>
> Currently parquet-mr hardcoded bloom filter fpp (false positive probability)
> to 0.01. We should have a config to let user to specify fpp.
--
This message was sent by Atlassian Jira
(v8.20.7#820007)