[ https://issues.apache.org/jira/browse/PARQUET-2157?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17554539#comment-17554539 ]
ASF GitHub Bot commented on PARQUET-2157: ----------------------------------------- ggershinsky commented on code in PR #975: URL: https://github.com/apache/parquet-mr/pull/975#discussion_r897894894 ########## parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java: ########## @@ -282,6 +286,63 @@ public void testParquetFileWithBloomFilter() throws IOException { } } + @Test + public void testParquetFileWithBloomFilterWithFpp() throws IOException { + int totalCount = 100000; + double[] testFpp = {0.005, 0.01, 0.05, 0.10, 0.15, 0.20, 0.25}; + + Set<String> distinctStrings = new HashSet<>(); + while (distinctStrings.size() < totalCount) { + String str = RandomStringUtils.randomAlphabetic(12); + distinctStrings.add(str); + } + + MessageType schema = Types.buildMessage(). + required(BINARY).as(stringType()).named("name").named("msg"); + + Configuration conf = new Configuration(); + GroupWriteSupport.setSchema(schema, conf); + + GroupFactory factory = new SimpleGroupFactory(schema); + for (int i = 0; i < testFpp.length; i++) { + File file = temp.newFile(); + file.delete(); + Path path = new Path(file.getAbsolutePath()); + try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path) + .withPageRowCountLimit(10) + .withConf(conf) + .withDictionaryEncoding(false) + .withBloomFilterEnabled("name", true) + .withBloomFilterNDV("name", totalCount) + .withBloomFilterFPP("name", testFpp[i]) + .build()) { + java.util.Iterator<String> iterator = distinctStrings.iterator(); + while (iterator.hasNext()) { + writer.write(factory.newGroup().append("name", iterator.next())); + } + } + distinctStrings.clear(); + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0); + BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData) + .readBloomFilter(blockMetaData.getColumns().get(0)); + + // The exist counts the number of 
times FindHash returns true. + int exist = 0; + while (distinctStrings.size() < totalCount) { + String str = RandomStringUtils.randomAlphabetic(10); Review Comment: the original values are 12 char long. Are we supposed to find a 10-char string among them? ########## parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java: ########## @@ -471,6 +484,12 @@ public Builder withBloomFilterNDV(String columnPath, long ndv) { return this; } + public Builder withBloomFilterFPP(String columnPath, double fpp) { Review Comment: what happens if this value is set, but the BF is not enabled? (general / per-column) ########## parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java: ########## @@ -282,6 +286,63 @@ public void testParquetFileWithBloomFilter() throws IOException { } } + @Test + public void testParquetFileWithBloomFilterWithFpp() throws IOException { + int totalCount = 100000; + double[] testFpp = {0.005, 0.01, 0.05, 0.10, 0.15, 0.20, 0.25}; + + Set<String> distinctStrings = new HashSet<>(); + while (distinctStrings.size() < totalCount) { + String str = RandomStringUtils.randomAlphabetic(12); + distinctStrings.add(str); + } + + MessageType schema = Types.buildMessage(). 
+ required(BINARY).as(stringType()).named("name").named("msg"); + + Configuration conf = new Configuration(); + GroupWriteSupport.setSchema(schema, conf); + + GroupFactory factory = new SimpleGroupFactory(schema); + for (int i = 0; i < testFpp.length; i++) { + File file = temp.newFile(); + file.delete(); + Path path = new Path(file.getAbsolutePath()); + try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path) + .withPageRowCountLimit(10) + .withConf(conf) + .withDictionaryEncoding(false) + .withBloomFilterEnabled("name", true) + .withBloomFilterNDV("name", totalCount) + .withBloomFilterFPP("name", testFpp[i]) + .build()) { + java.util.Iterator<String> iterator = distinctStrings.iterator(); + while (iterator.hasNext()) { + writer.write(factory.newGroup().append("name", iterator.next())); + } + } + distinctStrings.clear(); + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0); + BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData) + .readBloomFilter(blockMetaData.getColumns().get(0)); + + // The exist counts the number of times FindHash returns true. + int exist = 0; + while (distinctStrings.size() < totalCount) { + String str = RandomStringUtils.randomAlphabetic(10); + if (distinctStrings.add(str) && + bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(str).toByteBuffer()))) { + exist++; + } + } + // The exist should be less than totalCount * fpp. Add 10% here for error space. + assertTrue(exist < totalCount * (testFpp[i] * 1.1)); Review Comment: just curious if `totalCount` is sufficient; how often `exist` > 0? 
> Add BloomFilter fpp config > -------------------------- > > Key: PARQUET-2157 > URL: https://issues.apache.org/jira/browse/PARQUET-2157 > Project: Parquet > Issue Type: Improvement > Components: parquet-mr > Reporter: Huaxin Gao > Priority: Major > > Currently parquet-mr hardcodes the bloom filter fpp (false positive probability) > to 0.01. We should have a config to let users specify the fpp. -- This message was sent by Atlassian Jira (v8.20.7#820007)