[GitHub] [spark] LuciferYang commented on a diff in pull request #42414: [SPARK-42664][CONNECT] Support `bloomFilter` function for `DataFrameStatFunctions`

via GitHub Wed, 09 Aug 2023 22:13:51 -0700


LuciferYang commented on code in PR #42414:
URL: https://github.com/apache/spark/pull/42414#discussion_r1289558335



##########
connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientDataFrameStatSuite.scala:
##########
@@ -176,4 +176,91 @@ class ClientDataFrameStatSuite extends RemoteSparkSession {
     assert(sketch.relativeError() === 0.001)
     assert(sketch.confidence() === 0.99 +- 5e-3)
   }
+
+  test("Bloom filter -- Long Column") {
+    val session = spark
+    import session.implicits._
+    val data = Seq(-143, -32, -5, 1, 17, 39, 43, 101, 127, 997).map(_.toLong)
+    val df = data.toDF("id")
+    val negativeValues = Seq(-11, 1021, 32767).map(_.toLong)
+    checkBloomFilter(data, negativeValues, df)
+  }
+
+  test("Bloom filter -- Int Column") {
+    val session = spark
+    import session.implicits._
+    val data = Seq(-143, -32, -5, 1, 17, 39, 43, 101, 127, 997)
+    val df = data.toDF("id")
+    val negativeValues = Seq(-11, 1021, 32767)
+    checkBloomFilter(data, negativeValues, df)
+  }
+
+  test("Bloom filter -- Short Column") {
+    val session = spark
+    import session.implicits._
+    val data = Seq(-143, -32, -5, 1, 17, 39, 43, 101, 127, 997).map(_.toShort)
+    val df = data.toDF("id")
+    val negativeValues = Seq(-11, 1021, 32767).map(_.toShort)
+    checkBloomFilter(data, negativeValues, df)
+  }
+
+  test("Bloom filter -- Byte Column") {
+    val session = spark
+    import session.implicits._
+    val data = Seq(-32, -5, 1, 17, 39, 43, 101, 127).map(_.toByte)
+    val df = data.toDF("id")
+    val negativeValues = Seq(-101, 55, 113).map(_.toByte)
+    checkBloomFilter(data, negativeValues, df)
+  }
+
+  test("Bloom filter -- String Column") {
+    val session = spark
+    import session.implicits._
+    val data = Seq(-143, -32, -5, 1, 17, 39, 43, 101, 127, 997).map(_.toString)
+    val df = data.toDF("id")
+    val negativeValues = Seq(-11, 1021, 32767).map(_.toString)
+    checkBloomFilter(data, negativeValues, df)
+  }
+
+  private def checkBloomFilter(
+      data: Seq[Any],
+      notContainValues: Seq[Any],
+      df: DataFrame): Unit = {
+    val filter1 = df.stat.bloomFilter("id", 1000, 0.03)
+    assert(filter1.expectedFpp() - 0.03 < 1e-3)
+    assert(data.forall(filter1.mightContain))
+    assert(notContainValues.forall(n => !filter1.mightContain(n)))

Review Comment:
   Added checks for values that are definitely not in the input data, asserting that `mightContain` returns false for each of them.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] LuciferYang commented on a diff in pull request #42414: [SPARK-42664][CONNECT] Support `bloomFilter` function for `DataFrameStatFunctions`

Reply via email to