nsivabalan commented on code in PR #9131:
URL: https://github.com/apache/hudi/pull/9131#discussion_r1256773354
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala:
##########
@@ -605,6 +605,18 @@ object DataSourceWriteOptions {
val DROP_PARTITION_COLUMNS: ConfigProperty[java.lang.Boolean] =
HoodieTableConfig.DROP_PARTITION_COLUMNS
+ val ENABLE_OPTIMIZED_UPDATE: ConfigProperty[String] = ConfigProperty
+ .key("hoodie.enable.spark.sql.optimized.update")
+ .defaultValue("true")
Review Comment:
We should add sinceVersion as well.
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestUpdateTable.scala:
##########
@@ -48,235 +48,14 @@ class TestUpdateTable extends HoodieSparkSqlTestBase {
Seq(1, "a1", 10.0, 1000)
)
- // update data
- spark.sql(s"update $tableName set price = 20 where id = 1")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 20.0, 1000)
- )
-
- // update data
- spark.sql(s"update $tableName set price = price * 2 where id = 1")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 40.0, 1000)
- )
- }
- })
- }
-
- test("Test Update Table Without Primary Key") {
- withRecordType()(withTempDir { tmp =>
- Seq("cow", "mor").foreach { tableType =>
- val tableName = generateTableName
- // create table
- spark.sql(
- s"""
- |create table $tableName (
- | id int,
- | name string,
- | price double,
- | ts long
- |) using hudi
- | location '${tmp.getCanonicalPath}/$tableName'
- | tblproperties (
- | type = '$tableType',
- | preCombineField = 'ts'
- | )
- """.stripMargin)
-
- // insert data to table
- spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 10.0, 1000)
- )
+ spark.sql("set hoodie.enable.spark.sql.optimized.update=false")
// update data
spark.sql(s"update $tableName set price = 20 where id = 1")
checkAnswer(s"select id, name, price, ts from $tableName")(
Seq(1, "a1", 20.0, 1000)
)
-
- // update data
- spark.sql(s"update $tableName set price = price * 2 where id = 1")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 40.0, 1000)
- )
}
})
}
-
- test("Test Update Table On Non-PK Condition") {
Review Comment:
Why are these tests being removed? I don't get the rationale.
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestUpdateTable.scala:
##########
@@ -48,235 +48,14 @@ class TestUpdateTable extends HoodieSparkSqlTestBase {
Seq(1, "a1", 10.0, 1000)
)
- // update data
- spark.sql(s"update $tableName set price = 20 where id = 1")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 20.0, 1000)
- )
-
- // update data
- spark.sql(s"update $tableName set price = price * 2 where id = 1")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 40.0, 1000)
- )
- }
- })
- }
-
- test("Test Update Table Without Primary Key") {
- withRecordType()(withTempDir { tmp =>
- Seq("cow", "mor").foreach { tableType =>
- val tableName = generateTableName
- // create table
- spark.sql(
- s"""
- |create table $tableName (
- | id int,
- | name string,
- | price double,
- | ts long
- |) using hudi
- | location '${tmp.getCanonicalPath}/$tableName'
- | tblproperties (
- | type = '$tableType',
- | preCombineField = 'ts'
- | )
- """.stripMargin)
-
- // insert data to table
- spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 10.0, 1000)
- )
+ spark.sql("set hoodie.enable.spark.sql.optimized.update=false")
// update data
spark.sql(s"update $tableName set price = 20 where id = 1")
checkAnswer(s"select id, name, price, ts from $tableName")(
Seq(1, "a1", 20.0, 1000)
)
-
- // update data
- spark.sql(s"update $tableName set price = price * 2 where id = 1")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 40.0, 1000)
- )
}
})
}
-
- test("Test Update Table On Non-PK Condition") {
- withRecordType()(withTempDir { tmp =>
- Seq("cow", "mor").foreach {tableType =>
- /** non-partitioned table */
- val tableName = generateTableName
- // create table
- spark.sql(
- s"""
- |create table $tableName (
- | id int,
- | name string,
- | price double,
- | ts long
- |) using hudi
- | location '${tmp.getCanonicalPath}/$tableName'
- | tblproperties (
- | type = '$tableType',
- | primaryKey = 'id',
- | preCombineField = 'ts'
- | )
- """.stripMargin)
-
- // insert data to table
- if (isSpark2) {
- spark.sql(s"insert into $tableName values (1, 'a1', cast(10.0 as
double), 1000), (2, 'a2', cast(20.0 as double), 1000)")
- } else {
- spark.sql(s"insert into $tableName values (1, 'a1', 10.0, 1000), (2,
'a2', 20.0, 1000)")
- }
-
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 10.0, 1000),
- Seq(2, "a2", 20.0, 1000)
- )
-
- // update data on non-pk condition
- spark.sql(s"update $tableName set price = 11.0, ts = 1001 where name =
'a1'")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 11.0, 1001),
- Seq(2, "a2", 20.0, 1000)
- )
-
- /** partitioned table */
- val ptTableName = generateTableName + "_pt"
- // create table
- spark.sql(
- s"""
- |create table $ptTableName (
- | id int,
- | name string,
- | price double,
- | ts long,
- | pt string
- |) using hudi
- | location '${tmp.getCanonicalPath}/$ptTableName'
- | tblproperties (
- | type = '$tableType',
- | primaryKey = 'id',
- | preCombineField = 'ts'
- | )
- | partitioned by (pt)
- """.stripMargin)
-
- // insert data to table
- if (isSpark2) {
- spark.sql(
- s"""
- |insert into $ptTableName
- |values (1, 'a1', cast(10.0 as double), 1000, "2021"), (2,
'a2', cast(20.0 as double), 1000, "2021"), (3, 'a2', cast(30.0 as double),
1000, "2022")
- |""".stripMargin)
- } else {
- spark.sql(
- s"""
- |insert into $ptTableName
- |values (1, 'a1', 10.0, 1000, "2021"), (2, 'a2', 20.0, 1000,
"2021"), (3, 'a2', 30.0, 1000, "2022")
- |""".stripMargin)
- }
-
- checkAnswer(s"select id, name, price, ts, pt from $ptTableName")(
- Seq(1, "a1", 10.0, 1000, "2021"),
- Seq(2, "a2", 20.0, 1000, "2021"),
- Seq(3, "a2", 30.0, 1000, "2022")
- )
-
- // update data on non-pk condition
- spark.sql(s"update $ptTableName set price = price * 1.1, ts = ts + 1
where name = 'a2'")
- checkAnswer(s"select id, name, price, ts, pt from $ptTableName")(
- Seq(1, "a1", 10.0, 1000, "2021"),
- Seq(2, "a2", 22.0, 1001, "2021"),
- Seq(3, "a2", 33.0, 1001, "2022")
- )
-
- spark.sql(s"update $ptTableName set price = price + 5, ts = ts + 1
where pt = '2021'")
- checkAnswer(s"select id, name, price, ts, pt from $ptTableName")(
- Seq(1, "a1", 15.0, 1001, "2021"),
- Seq(2, "a2", 27.0, 1002, "2021"),
- Seq(3, "a2", 33.0, 1001, "2022")
- )
- }
- })
- }
-
- test("Test ignoring case for Update Table") {
- withRecordType()(withTempDir { tmp =>
- Seq("cow", "mor").foreach {tableType =>
- val tableName = generateTableName
- // create table
- spark.sql(
- s"""
- |create table $tableName (
- | ID int,
- | NAME string,
- | PRICE double,
- | TS long
- |) using hudi
- | location '${tmp.getCanonicalPath}/$tableName'
- | options (
- | type = '$tableType',
- | primaryKey = 'ID',
- | preCombineField = 'TS'
- | )
- """.stripMargin)
- // insert data to table
- spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 10.0, 1000)
- )
-
- // update data
- spark.sql(s"update $tableName set PRICE = 20 where ID = 1")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 20.0, 1000)
- )
-
- // update data
- spark.sql(s"update $tableName set PRICE = PRICE * 2 where ID = 1")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 40.0, 1000)
- )
- }
- })
- }
-
- test("Test decimal type") {
- withTempDir { tmp =>
- val tableName = generateTableName
- // create table
- spark.sql(
- s"""
- |create table $tableName (
- | id int,
- | name string,
- | price double,
- | ts long,
- | ff decimal(38, 10)
- |) using hudi
- | location '${tmp.getCanonicalPath}/$tableName'
- | tblproperties (
- | type = 'mor',
- | primaryKey = 'id',
- | preCombineField = 'ts'
- | )
- """.stripMargin)
-
- // insert data to table
- spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000, 10.0")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 10.0, 1000)
- )
-
- spark.sql(s"update $tableName set price = 22 where id = 1")
- checkAnswer(s"select id, name, price, ts from $tableName")(
- Seq(1, "a1", 22.0, 1000)
- )
- }
- }
Review Comment:
Also, can you parametrize some of the update and delete tests for both
cases? Just in case some code changes break the unoptimized flow, our tests
should catch them.
Do not parametrize every test, only a subset of them.
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestUpdateTable.scala:
##########
@@ -23,7 +23,7 @@ class TestUpdateTable extends HoodieSparkSqlTestBase {
test("Test Update Table") {
withRecordType()(withTempDir { tmp =>
- Seq("cow", "mor").foreach { tableType =>
+ Seq("cow").foreach { tableType =>
Review Comment:
Why this change?
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala:
##########
@@ -605,6 +605,18 @@ object DataSourceWriteOptions {
val DROP_PARTITION_COLUMNS: ConfigProperty[java.lang.Boolean] =
HoodieTableConfig.DROP_PARTITION_COLUMNS
+ val ENABLE_OPTIMIZED_UPDATE: ConfigProperty[String] = ConfigProperty
+ .key("hoodie.enable.spark.sql.optimized.update")
+ .defaultValue("true")
+ .markAdvanced()
+ .withDocumentation("Controls whether spark sql optimized update is
enabled.")
+
+ val ENABLE_OPTIMIZED_DELETE: ConfigProperty[String] = ConfigProperty
+ .key("hoodie.enable.spark.sql.optimized.delete")
Review Comment:
Let's try to follow our usual way of naming configs. Generally we try to keep
"enable" at the end, e.g.
"hoodie.spark.sql.writes.optimized.updates.enable"
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/HoodieSparkSqlTestBase.scala:
##########
@@ -195,7 +195,7 @@ class HoodieSparkSqlTestBase extends FunSuite with
BeforeAndAfterAll {
protected def withRecordType(recordConfig: Map[HoodieRecordType, Map[String,
String]]=Map.empty)(f: => Unit) {
// TODO HUDI-5264 Test parquet log with avro record in spark sql test
- Seq(HoodieRecordType.AVRO, HoodieRecordType.SPARK).foreach { recordType =>
+ Seq(HoodieRecordType.AVRO).foreach { recordType =>
Review Comment:
why this change?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]