amousavigourabi commented on code in PR #1170:
URL: https://github.com/apache/parquet-mr/pull/1170#discussion_r1359360435


##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -305,41 +306,41 @@ public void testParquetFileWithBloomFilterWithFpp() throws IOException {
     GroupWriteSupport.setSchema(schema, conf);
 
     GroupFactory factory = new SimpleGroupFactory(schema);
-    for (int i = 0; i < testFpp.length; i++) {
+    for (double testFpp : testFpps) {
       File file = temp.newFile();
-      file.delete();
+      assertTrue(file.delete());
       Path path = new Path(file.getAbsolutePath());
       try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
         .withPageRowCountLimit(10)
         .withConf(conf)
         .withDictionaryEncoding(false)
         .withBloomFilterEnabled("name", true)
-        .withBloomFilterNDV("name", totalCount)
-        .withBloomFilterFPP("name", testFpp[i])
+        .withBloomFilterNDV("name", buildBloomFilterCount)
+        .withBloomFilterFPP("name", testFpp)
         .build()) {
-        java.util.Iterator<String> iterator = distinctStrings.iterator();
-        while (iterator.hasNext()) {
-          writer.write(factory.newGroup().append("name", iterator.next()));
+        for (String str : distinctStringsForFileGenerate) {
+          writer.write(factory.newGroup().append("name", str));
         }
       }
-      distinctStrings.clear();
 
       try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
         BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
         BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData)
           .readBloomFilter(blockMetaData.getColumns().get(0));
 
         // The exist counts the number of times FindHash returns true.
-        int exist = 0;
-        while (distinctStrings.size() < totalCount) {
-          String str = RandomStringUtils.randomAlphabetic(randomStrLen - 2);
-          if (distinctStrings.add(str) &&
+        int falsePositive = 0;
+        Set<String> distinctStringsForProbe = new HashSet<>();
+        while (distinctStringsForProbe.size() < testBloomFilterCount) {
+          String str = RandomStringUtils.randomAlphabetic(randomStrLen - 1);
+          if (distinctStringsForProbe.add(str) &&
+            bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(str).toByteBuffer()))) {
-            exist++;
+            falsePositive++;
           }
         }
         // The exist should be less than totalCount * fpp. Add 10% here for error space.
-        assertTrue(exist < totalCount * (testFpp[i] * 1.1) && exist > 0);
+        int expectedFalsePositiveMaxCount = (int) Math.floor(testBloomFilterCount * (testFpp * 1.1));

Review Comment:
   We could avoid the cast and redundant parentheses as follows:
   
   ```suggestion
           double expectedFalsePositiveMaxCount = Math.floor(testBloomFilterCount * testFpp * 1.1);
   ```
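   
   With a `double` bound, the assertion that follows (not shown in this hunk) would also need no cast, since the `int` counter is widened automatically in the comparison. A minimal sketch, assuming the assertion checks the loop counter `falsePositive` against this bound:
   
   ```java
   // Sketch only: assumes the assertion after this hunk compares the probe
   // loop's int counter against the bound; the int operand is implicitly
   // widened to double, so no explicit cast is required.
   double expectedFalsePositiveMaxCount = Math.floor(testBloomFilterCount * testFpp * 1.1);
   assertTrue(falsePositive > 0 && falsePositive <= expectedFalsePositiveMaxCount);
   ```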



##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -305,41 +306,41 @@ public void testParquetFileWithBloomFilterWithFpp() throws IOException {
     GroupWriteSupport.setSchema(schema, conf);
 
     GroupFactory factory = new SimpleGroupFactory(schema);
-    for (int i = 0; i < testFpp.length; i++) {
+    for (double testFpp : testFpps) {
       File file = temp.newFile();
-      file.delete();
+      assertTrue(file.delete());

Review Comment:
   Do we need this assertion? AFAIK this would only trigger when there is some 
I/O failure and does not help evaluate the correctness of the code.
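   
   If the goal is only to make sure the path is free before the writer creates the file, a hypothetical alternative (not what this PR does) would drop the assertion and let I/O problems surface as exceptions:
   
   ```java
   // Hypothetical alternative: a failed delete here is an environment/I/O
   // issue rather than incorrect behaviour of the code under test, so let
   // it surface as an IOException instead of an assertion failure.
   java.nio.file.Files.deleteIfExists(file.toPath());
   ```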



##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -305,41 +306,41 @@ public void testParquetFileWithBloomFilterWithFpp() throws IOException {
     GroupWriteSupport.setSchema(schema, conf);
 
     GroupFactory factory = new SimpleGroupFactory(schema);
-    for (int i = 0; i < testFpp.length; i++) {
+    for (double testFpp : testFpps) {
       File file = temp.newFile();
-      file.delete();
+      assertTrue(file.delete());
       Path path = new Path(file.getAbsolutePath());
       try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
         .withPageRowCountLimit(10)
         .withConf(conf)
         .withDictionaryEncoding(false)
         .withBloomFilterEnabled("name", true)
-        .withBloomFilterNDV("name", totalCount)
-        .withBloomFilterFPP("name", testFpp[i])
+        .withBloomFilterNDV("name", buildBloomFilterCount)
+        .withBloomFilterFPP("name", testFpp)
         .build()) {
-        java.util.Iterator<String> iterator = distinctStrings.iterator();
-        while (iterator.hasNext()) {
-          writer.write(factory.newGroup().append("name", iterator.next()));
+        for (String str : distinctStringsForFileGenerate) {
+          writer.write(factory.newGroup().append("name", str));
         }
       }
-      distinctStrings.clear();
 
       try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
         BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
         BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData)
           .readBloomFilter(blockMetaData.getColumns().get(0));
 
         // The exist counts the number of times FindHash returns true.
-        int exist = 0;
-        while (distinctStrings.size() < totalCount) {
-          String str = RandomStringUtils.randomAlphabetic(randomStrLen - 2);
-          if (distinctStrings.add(str) &&
+        int falsePositive = 0;

Review Comment:
   Could we revert the name for this back to `exist`? I don't think these are necessarily _all_ false positives.
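   
   For reference, a `findHash` hit is only guaranteed to be a false positive when the probe string cannot have been written. A sketch of that invariant, under the assumption (not stated in this hunk) that the written strings all have a different length than the probes:
   
   ```java
   // Illustration only: if no written string has length randomStrLen - 1,
   // any findHash hit on a probe of that length is by construction a false
   // positive; otherwise it may be a true positive, as noted above.
   String probe = RandomStringUtils.randomAlphabetic(randomStrLen - 1);
   boolean hit = bloomFilter.findHash(
     LongHashFunction.xx(0).hashBytes(Binary.fromString(probe).toByteBuffer()));
   ```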



##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java:
##########
@@ -305,41 +306,41 @@ public void testParquetFileWithBloomFilterWithFpp() throws IOException {
     GroupWriteSupport.setSchema(schema, conf);
 
     GroupFactory factory = new SimpleGroupFactory(schema);
-    for (int i = 0; i < testFpp.length; i++) {
+    for (double testFpp : testFpps) {
       File file = temp.newFile();
-      file.delete();
+      assertTrue(file.delete());
       Path path = new Path(file.getAbsolutePath());
       try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
         .withPageRowCountLimit(10)
         .withConf(conf)
         .withDictionaryEncoding(false)
         .withBloomFilterEnabled("name", true)
-        .withBloomFilterNDV("name", totalCount)
-        .withBloomFilterFPP("name", testFpp[i])
+        .withBloomFilterNDV("name", buildBloomFilterCount)
+        .withBloomFilterFPP("name", testFpp)
         .build()) {
-        java.util.Iterator<String> iterator = distinctStrings.iterator();
-        while (iterator.hasNext()) {
-          writer.write(factory.newGroup().append("name", iterator.next()));
+        for (String str : distinctStringsForFileGenerate) {
+          writer.write(factory.newGroup().append("name", str));
         }
       }
-      distinctStrings.clear();
 
       try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
         BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
         BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData)
           .readBloomFilter(blockMetaData.getColumns().get(0));
 
         // The exist counts the number of times FindHash returns true.
-        int exist = 0;
-        while (distinctStrings.size() < totalCount) {
-          String str = RandomStringUtils.randomAlphabetic(randomStrLen - 2);
-          if (distinctStrings.add(str) &&
+        int falsePositive = 0;
+        Set<String> distinctStringsForProbe = new HashSet<>();
+        while (distinctStringsForProbe.size() < testBloomFilterCount) {
+          String str = RandomStringUtils.randomAlphabetic(randomStrLen - 1);
+          if (distinctStringsForProbe.add(str) &&
+            bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(str).toByteBuffer()))) {
-            exist++;
+            falsePositive++;
           }
         }
         // The exist should be less than totalCount * fpp. Add 10% here for error space.

Review Comment:
   Could you update the comments that refer to old variable names?
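   
   For example, the comment above the probe loop could read something like this (a sketch only, exact wording up to you):
   
   ```java
   // Counts how many times findHash returns true for probe strings that
   // were never written; this should stay below testBloomFilterCount *
   // testFpp, with 10% added for error space.
   ```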



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
