[
https://issues.apache.org/jira/browse/PARQUET-2361?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17775214#comment-17775214
]
ASF GitHub Bot commented on PARQUET-2361:
-----------------------------------------
amousavigourabi commented on code in PR #1170:
URL: https://github.com/apache/parquet-mr/pull/1170#discussion_r1359360435
##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -305,41 +306,41 @@ public void testParquetFileWithBloomFilterWithFpp()
throws IOException {
GroupWriteSupport.setSchema(schema, conf);
GroupFactory factory = new SimpleGroupFactory(schema);
- for (int i = 0; i < testFpp.length; i++) {
+ for (double testFpp : testFpps) {
File file = temp.newFile();
- file.delete();
+ assertTrue(file.delete());
Path path = new Path(file.getAbsolutePath());
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
.withPageRowCountLimit(10)
.withConf(conf)
.withDictionaryEncoding(false)
.withBloomFilterEnabled("name", true)
- .withBloomFilterNDV("name", totalCount)
- .withBloomFilterFPP("name", testFpp[i])
+ .withBloomFilterNDV("name", buildBloomFilterCount)
+ .withBloomFilterFPP("name", testFpp)
.build()) {
- java.util.Iterator<String> iterator = distinctStrings.iterator();
- while (iterator.hasNext()) {
- writer.write(factory.newGroup().append("name", iterator.next()));
+ for (String str : distinctStringsForFileGenerate) {
+ writer.write(factory.newGroup().append("name", str));
}
}
- distinctStrings.clear();
try (ParquetFileReader reader =
ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
BloomFilter bloomFilter =
reader.getBloomFilterDataReader(blockMetaData)
.readBloomFilter(blockMetaData.getColumns().get(0));
// The exist counts the number of times FindHash returns true.
- int exist = 0;
- while (distinctStrings.size() < totalCount) {
- String str = RandomStringUtils.randomAlphabetic(randomStrLen - 2);
- if (distinctStrings.add(str) &&
+ int falsePositive = 0;
+ Set<String> distinctStringsForProbe = new HashSet<>();
+ while (distinctStringsForProbe.size() < testBloomFilterCount) {
+ String str = RandomStringUtils.randomAlphabetic(randomStrLen - 1);
+ if (distinctStringsForProbe.add(str) &&
bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(str).toByteBuffer())))
{
- exist++;
+ falsePositive++;
}
}
// The exist should be less than totalCount * fpp. Add 10% here for
error space.
- assertTrue(exist < totalCount * (testFpp[i] * 1.1) && exist > 0);
+ int expectedFalsePositiveMaxCount = (int)
Math.floor(testBloomFilterCount * (testFpp * 1.1));
Review Comment:
We could avoid the cast and redundant parentheses as follows:
```suggestion
double expectedFalsePositiveMaxCount =
Math.floor(testBloomFilterCount * testFpp * 1.1);
```
##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -305,41 +306,41 @@ public void testParquetFileWithBloomFilterWithFpp()
throws IOException {
GroupWriteSupport.setSchema(schema, conf);
GroupFactory factory = new SimpleGroupFactory(schema);
- for (int i = 0; i < testFpp.length; i++) {
+ for (double testFpp : testFpps) {
File file = temp.newFile();
- file.delete();
+ assertTrue(file.delete());
Review Comment:
Do we need this assertion? AFAIK it would only fail on an I/O error, and it
does not help evaluate the correctness of the code.
##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -305,41 +306,41 @@ public void testParquetFileWithBloomFilterWithFpp()
throws IOException {
GroupWriteSupport.setSchema(schema, conf);
GroupFactory factory = new SimpleGroupFactory(schema);
- for (int i = 0; i < testFpp.length; i++) {
+ for (double testFpp : testFpps) {
File file = temp.newFile();
- file.delete();
+ assertTrue(file.delete());
Path path = new Path(file.getAbsolutePath());
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
.withPageRowCountLimit(10)
.withConf(conf)
.withDictionaryEncoding(false)
.withBloomFilterEnabled("name", true)
- .withBloomFilterNDV("name", totalCount)
- .withBloomFilterFPP("name", testFpp[i])
+ .withBloomFilterNDV("name", buildBloomFilterCount)
+ .withBloomFilterFPP("name", testFpp)
.build()) {
- java.util.Iterator<String> iterator = distinctStrings.iterator();
- while (iterator.hasNext()) {
- writer.write(factory.newGroup().append("name", iterator.next()));
+ for (String str : distinctStringsForFileGenerate) {
+ writer.write(factory.newGroup().append("name", str));
}
}
- distinctStrings.clear();
try (ParquetFileReader reader =
ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
BloomFilter bloomFilter =
reader.getBloomFilterDataReader(blockMetaData)
.readBloomFilter(blockMetaData.getColumns().get(0));
// The exist counts the number of times FindHash returns true.
- int exist = 0;
- while (distinctStrings.size() < totalCount) {
- String str = RandomStringUtils.randomAlphabetic(randomStrLen - 2);
- if (distinctStrings.add(str) &&
+ int falsePositive = 0;
Review Comment:
Could we revert the name for this back to `exist`? I don't think these are
necessarily _all_ false positives.
##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -305,41 +306,41 @@ public void testParquetFileWithBloomFilterWithFpp()
throws IOException {
GroupWriteSupport.setSchema(schema, conf);
GroupFactory factory = new SimpleGroupFactory(schema);
- for (int i = 0; i < testFpp.length; i++) {
+ for (double testFpp : testFpps) {
File file = temp.newFile();
- file.delete();
+ assertTrue(file.delete());
Path path = new Path(file.getAbsolutePath());
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
.withPageRowCountLimit(10)
.withConf(conf)
.withDictionaryEncoding(false)
.withBloomFilterEnabled("name", true)
- .withBloomFilterNDV("name", totalCount)
- .withBloomFilterFPP("name", testFpp[i])
+ .withBloomFilterNDV("name", buildBloomFilterCount)
+ .withBloomFilterFPP("name", testFpp)
.build()) {
- java.util.Iterator<String> iterator = distinctStrings.iterator();
- while (iterator.hasNext()) {
- writer.write(factory.newGroup().append("name", iterator.next()));
+ for (String str : distinctStringsForFileGenerate) {
+ writer.write(factory.newGroup().append("name", str));
}
}
- distinctStrings.clear();
try (ParquetFileReader reader =
ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
BloomFilter bloomFilter =
reader.getBloomFilterDataReader(blockMetaData)
.readBloomFilter(blockMetaData.getColumns().get(0));
// The exist counts the number of times FindHash returns true.
- int exist = 0;
- while (distinctStrings.size() < totalCount) {
- String str = RandomStringUtils.randomAlphabetic(randomStrLen - 2);
- if (distinctStrings.add(str) &&
+ int falsePositive = 0;
+ Set<String> distinctStringsForProbe = new HashSet<>();
+ while (distinctStringsForProbe.size() < testBloomFilterCount) {
+ String str = RandomStringUtils.randomAlphabetic(randomStrLen - 1);
+ if (distinctStringsForProbe.add(str) &&
bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(str).toByteBuffer())))
{
- exist++;
+ falsePositive++;
}
}
// The exist should be less than totalCount * fpp. Add 10% here for
error space.
Review Comment:
Could you update the comments that refer to old variable names?
> Reduce failure rate of unit test testParquetFileWithBloomFilterWithFpp
> ----------------------------------------------------------------------
>
> Key: PARQUET-2361
> URL: https://issues.apache.org/jira/browse/PARQUET-2361
> Project: Parquet
> Issue Type: Test
> Components: parquet-mr
> Affects Versions: 1.13.2
> Reporter: Feng Jiajie
> Priority: Major
>
> {code:java}
> [INFO] Results:
> [INFO]
> Error: Failures:
> Error: TestParquetWriter.testParquetFileWithBloomFilterWithFpp:342
> [INFO] {code}
> The unit test generates random strings for its test data without using a
> fixed seed. The test expects the number of false positives observed in the
> Bloom filter to match the configured false-positive probability. Therefore, a
> simple fix is to increase the number of probes against the Bloom filter. The
> reason for not using a fixed seed is to avoid making the test effective only
> in specific scenarios. If a fixed seed is necessary, I can also modify the PR
> accordingly.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)