codope commented on code in PR #12290:
URL: https://github.com/apache/hudi/pull/12290#discussion_r1853489260


##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestFunctionalIndex.scala:
##########
@@ -741,6 +744,93 @@ class TestFunctionalIndex extends HoodieSparkSqlTestBase {
     }
   }
 
+  @Test
+  def testBloomFiltersIndexPruning(): Unit = {
+    if (HoodieSparkUtils.gteqSpark3_3) {
+      withTempDir { tmp =>
+        Seq("cow", "mor").foreach { tableType =>
+          val tableName = generateTableName + s"_bloom_pruning_$tableType"
+          val basePath = s"${tmp.getCanonicalPath}/$tableName"
+
+          spark.sql(
+            s"""
+           CREATE TABLE $tableName (
+               |    ts BIGINT,
+               |    id STRING,
+               |    rider STRING,
+               |    driver STRING,
+               |    fare DOUBLE,
+               |    city STRING,
+               |    state STRING
+               |) USING HUDI
+               |options(
+               |    primaryKey ='id',
+               |    type = '$tableType',
+               |    hoodie.metadata.enable = 'true',
+               |    hoodie.datasource.write.recordkey.field = 'id',
+               |    hoodie.enable.data.skipping = 'true'
+               |)
+               |PARTITIONED BY (state)
+               |location '$basePath'
+       """.stripMargin)
+
+          spark.sql("set hoodie.parquet.small.file.limit=0")
+          spark.sql("set hoodie.enable.data.skipping=true")
+          spark.sql("set hoodie.metadata.enable=true")
+
+          if (HoodieSparkUtils.gteqSpark3_4) {
+            spark.sql("spark.sql.defaultColumn.enabled=false")
+          }
+
+          spark.sql(
+            s"""
+               |insert into $tableName(ts, id, rider, driver, fare, city, 
state) VALUES
+               |  
(1695159649,'trip1','rider-A','driver-K',19.10,'san_francisco','california'),
+               |  
(1695414531,'trip6','rider-C','driver-K',17.14,'san_diego','california'),
+               |  
(1695332066,'trip3','rider-E','driver-O',93.50,'austin','texas'),
+               |  
(1695516137,'trip4','rider-F','driver-P',34.15,'houston','texas')
+               |""".stripMargin)
+
+          spark.sql(
+            s"""
+               |insert into $tableName(ts, id, rider, driver, fare, city, 
state) VALUES
+               |  
(1695091554,'trip2','rider-C','driver-M',27.70,'sunnyvale','california'),
+               |  
(1699349649,'trip5','rider-A','driver-Q',3.32,'san_diego','texas')
+               |""".stripMargin)
+
+          spark.sql(s"create index idx_bloom_$tableName on $tableName using 
bloom_filters(city) options(func='upper', numHashFunctions=1, 
fpp=0.00000000001)")
+
+          // Pruning takes place only if query uses upper function on city
+          checkAnswer(s"select id, rider from $tableName where upper(city) in 
('sunnyvale', 'sg')")()
+          checkAnswer(s"select id, rider from $tableName where lower(city) = 
'sunny'")()
+          checkAnswer(s"select id, rider from $tableName where upper(city) = 
'SUNNYVALE'")(
+            Seq("trip2", "rider-C")
+          )
+          checkAnswer(s"select id, rider from $tableName where city in 
('san_diego', 'sunnyvale')")(
+            Seq("trip2", "rider-C"),
+            Seq("trip5", "rider-A"),
+            Seq("trip6", "rider-C")
+          )
+
+          spark.sql(s"drop index idx_bloom_$tableName on $tableName")
+
+          spark.sql(s"create index idx_bloom_$tableName on $tableName using 
bloom_filters(city) options(numHashFunctions=1, fpp=0.00000000001)")
+          // Pruning takes place only if query uses no function on city
+          checkAnswer(s"select id, rider from $tableName where upper(city) in 
('sunnyvale', 'sg')")()
+          checkAnswer(s"select id, rider from $tableName where lower(city) = 
'sunny'")()
+          checkAnswer(s"select id, rider from $tableName where upper(city) = 
'SUNNYVALE'")(
+            Seq("trip2", "rider-C")
+          )
+          checkAnswer(s"select id, rider from $tableName where city in 
('san_diego', 'sunnyvale')")(

Review Comment:
   Added in my latest commit: the test now verifies predicates both with and without the function.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to