yihua commented on code in PR #9866:
URL: https://github.com/apache/hudi/pull/9866#discussion_r1361376081
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala:
##########
@@ -1968,6 +1968,110 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
})
}
+ test("Test multiple partition fields pruning") {
+
+ withRecordType()(withTempDir { tmp =>
+ val targetTable = generateTableName
+ spark.sql(
+ s"""
+ |create table ${targetTable} (
+ | `id` string,
+ | `name` string,
+ | `dt` bigint,
+ | `day` STRING,
+ | `hour` INT
+ |) using hudi
+ |tblproperties (
+ | 'primaryKey' = 'id',
+ | 'type' = 'mor',
+ | 'preCombineField'='dt',
+ | 'hoodie.index.type' = 'BUCKET',
+ | 'hoodie.bucket.index.hash.field' = 'id',
+ | 'hoodie.bucket.index.num.buckets'=512
+ | )
+ |partitioned by (`day`,`hour`)
+ |location '${tmp.getCanonicalPath}/$targetTable'
+ |""".stripMargin)
+ spark.sql(
+ s"""
+ |insert into ${targetTable}
+ |select '1' as id, 'aa' as name, 123 as dt, '2023-10-12' as `day`,
10 as `hour`
+ |""".stripMargin)
+ spark.sql(
+ s"""
+ |insert into ${targetTable}
+ |select '1' as id, 'aa' as name, 123 as dt, '2023-10-12' as `day`,
11 as `hour`
+ |""".stripMargin)
+ spark.sql(
+ s"""
+ |insert into ${targetTable}
+ |select '1' as id, 'aa' as name, 123 as dt, '2023-10-12' as `day`,
12 as `hour`
+ |""".stripMargin)
+ val df = spark.sql(
+ s"""
+ |select * from ${targetTable} where day='2023-10-12' and hour=11;
+ |""".stripMargin)
+ var rdd_head = df.rdd
Review Comment:
```suggestion
var rddHead = df.rdd
```
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala:
##########
@@ -1968,6 +1968,110 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
})
}
+ test("Test multiple partition fields pruning") {
+
+ withRecordType()(withTempDir { tmp =>
+ val targetTable = generateTableName
+ spark.sql(
+ s"""
+ |create table ${targetTable} (
+ | `id` string,
+ | `name` string,
+ | `dt` bigint,
+ | `day` STRING,
+ | `hour` INT
+ |) using hudi
+ |tblproperties (
+ | 'primaryKey' = 'id',
+ | 'type' = 'mor',
+ | 'preCombineField'='dt',
+ | 'hoodie.index.type' = 'BUCKET',
+ | 'hoodie.bucket.index.hash.field' = 'id',
+ | 'hoodie.bucket.index.num.buckets'=512
+ | )
+ |partitioned by (`day`,`hour`)
+ |location '${tmp.getCanonicalPath}/$targetTable'
+ |""".stripMargin)
+ spark.sql(
+ s"""
+ |insert into ${targetTable}
+ |select '1' as id, 'aa' as name, 123 as dt, '2023-10-12' as `day`,
10 as `hour`
+ |""".stripMargin)
+ spark.sql(
+ s"""
+ |insert into ${targetTable}
+ |select '1' as id, 'aa' as name, 123 as dt, '2023-10-12' as `day`,
11 as `hour`
+ |""".stripMargin)
+ spark.sql(
+ s"""
+ |insert into ${targetTable}
+ |select '1' as id, 'aa' as name, 123 as dt, '2023-10-12' as `day`,
12 as `hour`
+ |""".stripMargin)
Review Comment:
nit: Is it possible to consolidate these statements into a single SQL statement
using CTAS, so that only one Hudi write transaction is performed and the test runtime is reduced?
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala:
##########
@@ -1968,6 +1968,110 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
})
}
+ test("Test multiple partition fields pruning") {
+
+ withRecordType()(withTempDir { tmp =>
+ val targetTable = generateTableName
+ spark.sql(
+ s"""
+ |create table ${targetTable} (
+ | `id` string,
+ | `name` string,
+ | `dt` bigint,
+ | `day` STRING,
+ | `hour` INT
+ |) using hudi
+ |tblproperties (
+ | 'primaryKey' = 'id',
+ | 'type' = 'mor',
+ | 'preCombineField'='dt',
+ | 'hoodie.index.type' = 'BUCKET',
+ | 'hoodie.bucket.index.hash.field' = 'id',
+ | 'hoodie.bucket.index.num.buckets'=512
+ | )
+ |partitioned by (`day`,`hour`)
+ |location '${tmp.getCanonicalPath}/$targetTable'
+ |""".stripMargin)
+ spark.sql(
+ s"""
+ |insert into ${targetTable}
+ |select '1' as id, 'aa' as name, 123 as dt, '2023-10-12' as `day`,
10 as `hour`
+ |""".stripMargin)
+ spark.sql(
+ s"""
+ |insert into ${targetTable}
+ |select '1' as id, 'aa' as name, 123 as dt, '2023-10-12' as `day`,
11 as `hour`
+ |""".stripMargin)
+ spark.sql(
+ s"""
+ |insert into ${targetTable}
+ |select '1' as id, 'aa' as name, 123 as dt, '2023-10-12' as `day`,
12 as `hour`
+ |""".stripMargin)
+ val df = spark.sql(
+ s"""
+ |select * from ${targetTable} where day='2023-10-12' and hour=11;
+ |""".stripMargin)
+ var rdd_head = df.rdd
+ while (rdd_head.dependencies.size > 0) {
+ assertResult(1)(rdd_head.partitions.size)
+ rdd_head = rdd_head.firstParent
+ }
+ assertResult(1)(rdd_head.partitions.size)
Review Comment:
Does this validation guarantee that multiple table partitions are not put
into the same RDD partition?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]