codope commented on code in PR #12290:
URL: https://github.com/apache/hudi/pull/12290#discussion_r1853398889
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/command/index/TestFunctionalIndex.scala:
##########
@@ -741,6 +744,93 @@ class TestFunctionalIndex extends HoodieSparkSqlTestBase {
}
}
+ @Test
+ def testBloomFiltersIndexPruning(): Unit = {
+ if (HoodieSparkUtils.gteqSpark3_3) {
+ withTempDir { tmp =>
+ Seq("cow", "mor").foreach { tableType =>
+ val tableName = generateTableName + s"_bloom_pruning_$tableType"
+ val basePath = s"${tmp.getCanonicalPath}/$tableName"
+
+ spark.sql(
+ s"""
+ CREATE TABLE $tableName (
+ | ts BIGINT,
+ | id STRING,
+ | rider STRING,
+ | driver STRING,
+ | fare DOUBLE,
+ | city STRING,
+ | state STRING
+ |) USING HUDI
+ |options(
+ | primaryKey ='id',
+ | type = '$tableType',
+ | hoodie.metadata.enable = 'true',
+ | hoodie.datasource.write.recordkey.field = 'id',
+ | hoodie.enable.data.skipping = 'true'
+ |)
+ |PARTITIONED BY (state)
+ |location '$basePath'
+ """.stripMargin)
+
+ spark.sql("set hoodie.parquet.small.file.limit=0")
+ spark.sql("set hoodie.enable.data.skipping=true")
+ spark.sql("set hoodie.metadata.enable=true")
+
+ if (HoodieSparkUtils.gteqSpark3_4) {
+ spark.sql("spark.sql.defaultColumn.enabled=false")
+ }
+
+ spark.sql(
+ s"""
+ |insert into $tableName(ts, id, rider, driver, fare, city,
state) VALUES
+ |
(1695159649,'trip1','rider-A','driver-K',19.10,'san_francisco','california'),
+ |
(1695414531,'trip6','rider-C','driver-K',17.14,'san_diego','california'),
+ |
(1695332066,'trip3','rider-E','driver-O',93.50,'austin','texas'),
+ |
(1695516137,'trip4','rider-F','driver-P',34.15,'houston','texas')
+ |""".stripMargin)
+
+ spark.sql(
+ s"""
+ |insert into $tableName(ts, id, rider, driver, fare, city,
state) VALUES
+ |
(1695091554,'trip2','rider-C','driver-M',27.70,'sunnyvale','california'),
+ |
(1699349649,'trip5','rider-A','driver-Q',3.32,'san_diego','texas')
+ |""".stripMargin)
+
+ spark.sql(s"create index idx_bloom_$tableName on $tableName using
bloom_filters(city) options(func='upper', numHashFunctions=1,
fpp=0.00000000001)")
+
+ // Pruning takes place only if query uses upper function on city
+ checkAnswer(s"select id, rider from $tableName where upper(city) in
('sunnyvale', 'sg')")()
+ checkAnswer(s"select id, rider from $tableName where lower(city) =
'sunny'")()
+ checkAnswer(s"select id, rider from $tableName where upper(city) =
'SUNNYVALE'")(
+ Seq("trip2", "rider-C")
+ )
+ checkAnswer(s"select id, rider from $tableName where city in
('san_diego', 'sunnyvale')")(
+ Seq("trip2", "rider-C"),
+ Seq("trip5", "rider-A"),
+ Seq("trip6", "rider-C")
+ )
+
+ spark.sql(s"drop index idx_bloom_$tableName on $tableName")
+
+ spark.sql(s"create index idx_bloom_$tableName on $tableName using
bloom_filters(city) options(numHashFunctions=1, fpp=0.00000000001)")
+ // Pruning takes place only if query uses no function on city
+ checkAnswer(s"select id, rider from $tableName where upper(city) in
('sunnyvale', 'sg')")()
+ checkAnswer(s"select id, rider from $tableName where lower(city) =
'sunny'")()
+ checkAnswer(s"select id, rider from $tableName where upper(city) =
'SUNNYVALE'")(
Review Comment:
The test passes, but data skipping is not happening. Also, if I remove the `upper`
function from the query, it fails. I will take a look.
<img width="1679" alt="Screenshot 2024-11-22 at 12 55 31 PM"
src="https://github.com/user-attachments/assets/5912b009-eee8-4d8d-9ef4-8ff58a40e0fd">
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]