codope commented on code in PR #12290:
URL: https://github.com/apache/hudi/pull/12290#discussion_r1852654736


##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestFunctionalIndex.scala:
##########
@@ -741,6 +744,93 @@ class TestFunctionalIndex extends HoodieSparkSqlTestBase {
     }
   }
 
+  @Test
+  def testBloomFiltersIndexPruning(): Unit = {
+    if (HoodieSparkUtils.gteqSpark3_3) {
+      withTempDir { tmp =>
+        Seq("cow", "mor").foreach { tableType =>
+          val tableName = generateTableName + s"_bloom_pruning_$tableType"
+          val basePath = s"${tmp.getCanonicalPath}/$tableName"
+
+          spark.sql(
+            s"""
+           CREATE TABLE $tableName (
+               |    ts BIGINT,
+               |    id STRING,
+               |    rider STRING,
+               |    driver STRING,
+               |    fare DOUBLE,
+               |    city STRING,
+               |    state STRING
+               |) USING HUDI
+               |options(
+               |    primaryKey ='id',
+               |    type = '$tableType',
+               |    hoodie.metadata.enable = 'true',
+               |    hoodie.datasource.write.recordkey.field = 'id',
+               |    hoodie.enable.data.skipping = 'true'
+               |)
+               |PARTITIONED BY (state)
+               |location '$basePath'
+       """.stripMargin)
+
+          spark.sql("set hoodie.parquet.small.file.limit=0")
+          spark.sql("set hoodie.enable.data.skipping=true")
+          spark.sql("set hoodie.metadata.enable=true")
+
+          if (HoodieSparkUtils.gteqSpark3_4) {
+            spark.sql("spark.sql.defaultColumn.enabled=false")
+          }
+
+          spark.sql(
+            s"""
+               |insert into $tableName(ts, id, rider, driver, fare, city, 
state) VALUES
+               |  
(1695159649,'trip1','rider-A','driver-K',19.10,'san_francisco','california'),
+               |  
(1695414531,'trip6','rider-C','driver-K',17.14,'san_diego','california'),
+               |  
(1695332066,'trip3','rider-E','driver-O',93.50,'austin','texas'),
+               |  
(1695516137,'trip4','rider-F','driver-P',34.15,'houston','texas')
+               |""".stripMargin)
+
+          spark.sql(
+            s"""
+               |insert into $tableName(ts, id, rider, driver, fare, city, 
state) VALUES
+               |  
(1695091554,'trip2','rider-C','driver-M',27.70,'sunnyvale','california'),
+               |  
(1699349649,'trip5','rider-A','driver-Q',3.32,'san_diego','texas')
+               |""".stripMargin)
+
+          spark.sql(s"create index idx_bloom_$tableName on $tableName using 
bloom_filters(city) options(func='upper', numHashFunctions=1, 
fpp=0.00000000001)")
+
+          // Pruning takes place only if query uses upper function on city
+          checkAnswer(s"select id, rider from $tableName where upper(city) in 
('sunnyvale', 'sg')")()
+          checkAnswer(s"select id, rider from $tableName where lower(city) = 
'sunny'")()
+          checkAnswer(s"select id, rider from $tableName where upper(city) = 
'SUNNYVALE'")(
+            Seq("trip2", "rider-C")
+          )
+          checkAnswer(s"select id, rider from $tableName where city in 
('san_diego', 'sunnyvale')")(
+            Seq("trip2", "rider-C"),
+            Seq("trip5", "rider-A"),
+            Seq("trip6", "rider-C")
+          )
+
+          spark.sql(s"drop index idx_bloom_$tableName on $tableName")
+
+          spark.sql(s"create index idx_bloom_$tableName on $tableName using 
bloom_filters(city) options(numHashFunctions=1, fpp=0.00000000001)")
+          // Pruning takes place only if query uses no function on city
+          checkAnswer(s"select id, rider from $tableName where upper(city) in 
('sunnyvale', 'sg')")()
+          checkAnswer(s"select id, rider from $tableName where lower(city) = 
'sunny'")()
+          checkAnswer(s"select id, rider from $tableName where upper(city) = 
'SUNNYVALE'")(
+            Seq("trip2", "rider-C")
+          )
+          checkAnswer(s"select id, rider from $tableName where city in 
('san_diego', 'sunnyvale')")(

Review Comment:
   Should we also add `verifyQueryPredicate`-like functionality here?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Reply via email to