[parquet-mr] branch master updated: PARQUET-2361: Reduce failure rate of unit test (#1170)

gangwu Wed, 18 Oct 2023 19:30:20 -0700

This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git



The following commit(s) were added to refs/heads/master by this push:
     new 354ddebe2 PARQUET-2361: Reduce failure rate of unit test (#1170)
354ddebe2 is described below

commit 354ddebe2f82229b1dbcd73cecc9b5af4fe816d8
Author: fengjiajie <[email protected]>
AuthorDate: Thu Oct 19 10:29:33 2023 +0800

    PARQUET-2361: Reduce failure rate of unit test (#1170)
---
 .../apache/parquet/hadoop/TestParquetWriter.java   | 41 +++++++++++-----------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java
index b404b4fba..e97164cb7 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java
@@ -288,14 +288,15 @@ public class TestParquetWriter {
 
   @Test
   public void testParquetFileWithBloomFilterWithFpp() throws IOException {
-    int totalCount = 100000;
-    double[] testFpp = {0.01, 0.05, 0.10, 0.15, 0.20, 0.25};
+    int buildBloomFilterCount = 100000;
+    double[] testFpps = {0.01, 0.05, 0.10, 0.15, 0.20, 0.25};
     int randomStrLen = 12;
+    final int testBloomFilterCount = 200000;
 
-    Set<String> distinctStrings = new HashSet<>();
-    while (distinctStrings.size() < totalCount) {
+    Set<String> distinctStringsForFileGenerate = new HashSet<>();
+    while (distinctStringsForFileGenerate.size() < buildBloomFilterCount) {
       String str = RandomStringUtils.randomAlphabetic(randomStrLen);
-      distinctStrings.add(str);
+      distinctStringsForFileGenerate.add(str);
     }
 
     MessageType schema = Types.buildMessage().
@@ -305,7 +306,7 @@ public class TestParquetWriter {
     GroupWriteSupport.setSchema(schema, conf);
 
     GroupFactory factory = new SimpleGroupFactory(schema);
-    for (int i = 0; i < testFpp.length; i++) {
+    for (double testFpp : testFpps) {
       File file = temp.newFile();
       file.delete();
       Path path = new Path(file.getAbsolutePath());
@@ -314,32 +315,32 @@ public class TestParquetWriter {
         .withConf(conf)
         .withDictionaryEncoding(false)
         .withBloomFilterEnabled("name", true)
-        .withBloomFilterNDV("name", totalCount)
-        .withBloomFilterFPP("name", testFpp[i])
+        .withBloomFilterNDV("name", buildBloomFilterCount)
+        .withBloomFilterFPP("name", testFpp)
         .build()) {
-        java.util.Iterator<String> iterator = distinctStrings.iterator();
-        while (iterator.hasNext()) {
-          writer.write(factory.newGroup().append("name", iterator.next()));
+        for (String str : distinctStringsForFileGenerate) {
+          writer.write(factory.newGroup().append("name", str));
         }
       }
-      distinctStrings.clear();
 
       try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
         BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
         BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData)
           .readBloomFilter(blockMetaData.getColumns().get(0));
 
-        // The exist counts the number of times FindHash returns true.
-        int exist = 0;
-        while (distinctStrings.size() < totalCount) {
-          String str = RandomStringUtils.randomAlphabetic(randomStrLen - 2);
-          if (distinctStrings.add(str) &&
+        // The false positive counts the number of times FindHash returns true.
+        int falsePositive = 0;
+        Set<String> distinctStringsForProbe = new HashSet<>();
+        while (distinctStringsForProbe.size() < testBloomFilterCount) {
+          String str = RandomStringUtils.randomAlphabetic(randomStrLen - 1);
+          if (distinctStringsForProbe.add(str) &&
             bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(str).toByteBuffer()))) {
-            exist++;
+            falsePositive++;
           }
         }
-        // The exist should be less than totalCount * fpp. Add 10% here for error space.
-        assertTrue(exist < totalCount * (testFpp[i] * 1.1) && exist > 0);
+        // The false positive should be less than totalCount * fpp. Add 15% here for error space.
+        double expectedFalsePositiveMaxCount = Math.floor(testBloomFilterCount * (testFpp * 1.15));
+        assertTrue(falsePositive < expectedFalsePositiveMaxCount && falsePositive > 0);
       }
     }
   }

[parquet-mr] branch master updated: PARQUET-2361: Reduce failure rate of unit test (#1170)

Reply via email to