ggershinsky commented on code in PR #975:
URL: https://github.com/apache/parquet-mr/pull/975#discussion_r897894894
##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -282,6 +286,63 @@ public void testParquetFileWithBloomFilter() throws
IOException {
}
}
+ @Test
+ public void testParquetFileWithBloomFilterWithFpp() throws IOException {
+ int totalCount = 100000;
+ double[] testFpp = {0.005, 0.01, 0.05, 0.10, 0.15, 0.20, 0.25};
+
+ Set<String> distinctStrings = new HashSet<>();
+ while (distinctStrings.size() < totalCount) {
+ String str = RandomStringUtils.randomAlphabetic(12);
+ distinctStrings.add(str);
+ }
+
+ MessageType schema = Types.buildMessage().
+ required(BINARY).as(stringType()).named("name").named("msg");
+
+ Configuration conf = new Configuration();
+ GroupWriteSupport.setSchema(schema, conf);
+
+ GroupFactory factory = new SimpleGroupFactory(schema);
+ for (int i = 0; i < testFpp.length; i++) {
+ File file = temp.newFile();
+ file.delete();
+ Path path = new Path(file.getAbsolutePath());
+ try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+ .withPageRowCountLimit(10)
+ .withConf(conf)
+ .withDictionaryEncoding(false)
+ .withBloomFilterEnabled("name", true)
+ .withBloomFilterNDV("name", totalCount)
+ .withBloomFilterFPP("name", testFpp[i])
+ .build()) {
+ java.util.Iterator<String> iterator = distinctStrings.iterator();
+ while (iterator.hasNext()) {
+ writer.write(factory.newGroup().append("name", iterator.next()));
+ }
+ }
+ distinctStrings.clear();
+
+ try (ParquetFileReader reader =
ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+ BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
+ BloomFilter bloomFilter =
reader.getBloomFilterDataReader(blockMetaData)
+ .readBloomFilter(blockMetaData.getColumns().get(0));
+
+ // The exist counts the number of times FindHash returns true.
+ int exist = 0;
+ while (distinctStrings.size() < totalCount) {
+ String str = RandomStringUtils.randomAlphabetic(10);
Review Comment:
the original values are 12 char long. Are we supposed to find a 10-char
string among them?
##########
parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java:
##########
@@ -471,6 +484,12 @@ public Builder withBloomFilterNDV(String columnPath, long
ndv) {
return this;
}
+ public Builder withBloomFilterFPP(String columnPath, double fpp) {
Review Comment:
what happens if this value is set, but the BF is not enabled? (general /
per-column)
##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -282,6 +286,63 @@ public void testParquetFileWithBloomFilter() throws
IOException {
}
}
+ @Test
+ public void testParquetFileWithBloomFilterWithFpp() throws IOException {
+ int totalCount = 100000;
+ double[] testFpp = {0.005, 0.01, 0.05, 0.10, 0.15, 0.20, 0.25};
+
+ Set<String> distinctStrings = new HashSet<>();
+ while (distinctStrings.size() < totalCount) {
+ String str = RandomStringUtils.randomAlphabetic(12);
+ distinctStrings.add(str);
+ }
+
+ MessageType schema = Types.buildMessage().
+ required(BINARY).as(stringType()).named("name").named("msg");
+
+ Configuration conf = new Configuration();
+ GroupWriteSupport.setSchema(schema, conf);
+
+ GroupFactory factory = new SimpleGroupFactory(schema);
+ for (int i = 0; i < testFpp.length; i++) {
+ File file = temp.newFile();
+ file.delete();
+ Path path = new Path(file.getAbsolutePath());
+ try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+ .withPageRowCountLimit(10)
+ .withConf(conf)
+ .withDictionaryEncoding(false)
+ .withBloomFilterEnabled("name", true)
+ .withBloomFilterNDV("name", totalCount)
+ .withBloomFilterFPP("name", testFpp[i])
+ .build()) {
+ java.util.Iterator<String> iterator = distinctStrings.iterator();
+ while (iterator.hasNext()) {
+ writer.write(factory.newGroup().append("name", iterator.next()));
+ }
+ }
+ distinctStrings.clear();
+
+ try (ParquetFileReader reader =
ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+ BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
+ BloomFilter bloomFilter =
reader.getBloomFilterDataReader(blockMetaData)
+ .readBloomFilter(blockMetaData.getColumns().get(0));
+
+ // The exist counts the number of times FindHash returns true.
+ int exist = 0;
+ while (distinctStrings.size() < totalCount) {
+ String str = RandomStringUtils.randomAlphabetic(10);
+ if (distinctStrings.add(str) &&
+
bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(str).toByteBuffer())))
{
+ exist++;
+ }
+ }
+ // The exist should be less than totalCount * fpp. Add 10% here for
error space.
+ assertTrue(exist < totalCount * (testFpp[i] * 1.1));
Review Comment:
just curious if `totalCount` is sufficient; how often `exist` > 0?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]