geserdugarov commented on code in PR #12245: URL: https://github.com/apache/hudi/pull/12245#discussion_r1855392037
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestInsertTable.scala:
##########
@@ -1668,47 +1668,102 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
           Seq(3, "a3,3", 30.0, 3000, "2021-01-07")
         )
 
-        spark.sql(
-          s"""
-             | insert into $tableName values
-             | (1, 'a1', 10, 1000, "2021-01-05"),
-             | (3, "a3", 30, 3000, "2021-01-07")
-          """.stripMargin)
+        // for COW table append bulk insert multiple times is restricted
+        if (tableType != "cow") {
+          spark.sql(
+            s"""
+               | insert into $tableName values
+               | (1, 'a1', 10, 1000, "2021-01-05"),
+               | (3, "a3", 30, 3000, "2021-01-07")
+            """.stripMargin)
 
-        checkAnswer(s"select id, name, price, ts, dt from $tableName")(
-          Seq(1, "a1,1", 10.0, 1000, "2021-01-05"),
-          Seq(1, "a1", 10.0, 1000, "2021-01-05"),
-          Seq(2, "a2", 20.0, 2000, "2021-01-06"),
-          Seq(3, "a3,3", 30.0, 3000, "2021-01-07"),
-          Seq(3, "a3", 30.0, 3000, "2021-01-07")
-        )
+          checkAnswer(s"select id, name, price, ts, dt from $tableName")(
+            Seq(1, "a1,1", 10.0, 1000, "2021-01-05"),
+            Seq(1, "a1", 10.0, 1000, "2021-01-05"),
+            Seq(2, "a2", 20.0, 2000, "2021-01-06"),
+            Seq(3, "a3,3", 30.0, 3000, "2021-01-07"),
+            Seq(3, "a3", 30.0, 3000, "2021-01-07")
+          )
 
-        // there are two files in partition(dt = '2021-01-05')
-        checkAnswer(s"select count(distinct _hoodie_file_name) from $tableName where dt = '2021-01-05'")(
-          Seq(2)
-        )
+          // there are two files in partition(dt = '2021-01-05')
+          checkAnswer(s"select count(distinct _hoodie_file_name) from $tableName where dt = '2021-01-05'")(
+            Seq(2)
+          )
 
-        // would generate 6 other files in partition(dt = '2021-01-05')
-        spark.sql(
-          s"""
-             | insert into $tableName values
-             | (4, 'a1,1', 10, 1000, "2021-01-05"),
-             | (5, 'a1,1', 10, 1000, "2021-01-05"),
-             | (6, 'a1,1', 10, 1000, "2021-01-05"),
-             | (7, 'a1,1', 10, 1000, "2021-01-05"),
-             | (8, 'a1,1', 10, 1000, "2021-01-05"),
-             | (10, 'a3,3', 30, 3000, "2021-01-05")
-          """.stripMargin)
-
-        checkAnswer(s"select count(distinct _hoodie_file_name) from $tableName where dt = '2021-01-05'")(
-          Seq(8)
-        )
+          // would generate 6 other files in partition(dt = '2021-01-05')
+          spark.sql(
+            s"""
+               | insert into $tableName values
+               | (4, 'a1,1', 10, 1000, "2021-01-05"),
+               | (5, 'a1,1', 10, 1000, "2021-01-05"),
+               | (6, 'a1,1', 10, 1000, "2021-01-05"),
+               | (7, 'a1,1', 10, 1000, "2021-01-05"),
+               | (8, 'a1,1', 10, 1000, "2021-01-05"),
+               | (10, 'a3,3', 30, 3000, "2021-01-05")
+            """.stripMargin)
+
+          checkAnswer(s"select count(distinct _hoodie_file_name) from $tableName where dt = '2021-01-05'")(
+            Seq(8)
+          )
+        }
         }
       }
     }
   }
 }
 
+  test("Test not supported multiple BULK INSERTs into SIMPLE BUCKET for COW without Spark native Row") {
+    withSQLConf("hoodie.datasource.write.operation" -> "bulk_insert",
+      "hoodie.bulkinsert.shuffle.parallelism" -> "1") {
+      withTempDir { tmp =>
+        val tableName = generateTableName
+        spark.sql(
+          s"""
+             |create table $tableName (
+             |  id long,
+             |  name string,
+             |  ts int,
+             |  par string
+             |) using hudi
+             | tblproperties (
+             |  primaryKey = 'id,name',
+             |  type = 'cow',
+             |  preCombineField = 'ts',
+             |  hoodie.index.type = 'BUCKET',
+             |  hoodie.index.bucket.engine = 'SIMPLE',
+             |  hoodie.bucket.index.num.buckets = '4',
+             |  hoodie.bucket.index.hash.field = 'id,name',
+             |  hoodie.datasource.write.row.writer.enable = 'false')
+             | partitioned by (par)
+             | location '${tmp.getCanonicalPath}'
+          """.stripMargin)
+        // `id,name` -> `bucketId` if there are 4 buckets
+        //   5,'a1,1'    -> 1
+        //   6,'a6,6'    -> 2
+        //   9,'a3,3'    -> 1
+        //   13,'a13,13' -> 2
+        //   24,'cd'     -> 0
+        // buckets 1 & 2 into partition 'main', bucket 1 into partition 'side'
+        spark.sql(s"insert into $tableName values (5, 'a1,1', 1, 'main'), (6, 'a6,6', 1, 'main'), (9, 'a3,3', 1, 'side')")
+        // bucket 1 into 'main', bucket 2 into 'side', the whole insert will fail due to existed bucket 1 in 'main'
+        val causeRegex = "Multiple bulk insert.* COW.* not supported.*"
+        checkExceptionMatch(s"insert into $tableName values (9, 'a3,3', 2, 'main'), (13, 'a13,13', 1, 'side')")(causeRegex)
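A note on the bucket ids in the test comment above: with the SIMPLE bucket engine, Hudi derives a record's bucket from the configured `hoodie.bucket.index.hash.field` values modulo `hoodie.bucket.index.num.buckets`, and bucket ids are tracked per partition. Below is a minimal Scala sketch of that mapping, assuming `java.util.List`-style hashing over the key-field values (a paraphrase of `BucketIdentifier`; the exact implementation may differ):

```scala
// Sketch only: assumed hashing scheme, not the exact Hudi BucketIdentifier code.
object BucketIdSketch {
  def bucketId(hashFieldValues: Seq[String], numBuckets: Int): Int = {
    // java.util.List-style hash: h = 31 * h + element.hashCode, starting from 1
    val h = hashFieldValues.foldLeft(1)((acc, v) => 31 * acc + v.hashCode)
    // mask to non-negative, then take modulo the configured bucket count
    (h & Integer.MAX_VALUE) % numBuckets
  }

  def main(args: Array[String]): Unit = {
    // hash field 'id,name' with 4 buckets, as in the test table above
    println(bucketId(Seq("5", "a1,1"), 4))
    println(bucketId(Seq("6", "a6,6"), 4))
  }
}
```

Since bucket ids are assigned per partition, bucket 1 can exist independently in both 'main' and 'side'; the expected failure in the test comes from the second bulk insert appending to the already existing bucket 1 in 'main'.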
Review Comment:
   It's because we have separate processing for this case. I also added the corresponding task HUDI-7757 earlier to resolve this hidden branching in the middle of the huge `HoodieSparkSqlWriter::writeInternal`:
   https://github.com/apache/hudi/blob/f4e810b4db3d1c5d01ed24488fe603c86c3f2f36/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala#L488-L493
   but couldn't find a quick solution at the time, and have now run into it again.
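For readers following HUDI-7757: the "hidden branching" is an early split inside one large write method, where the Spark-native Row bulk insert path returns before the generic path runs. Here is a condensed, hypothetical sketch of that shape (illustrative names only, not the actual Hudi source):

```scala
// Hypothetical, self-contained illustration of the early-return branch the
// comment refers to -- NOT the actual Hudi source. When the row writer is
// enabled, bulk insert short-circuits into a dedicated Row-based path, so
// the generic path below never runs for it.
object WriteInternalSketch {
  sealed trait Operation
  case object BulkInsert extends Operation
  case object Upsert extends Operation

  def writeInternal(op: Operation, rowWriterEnabled: Boolean): String = {
    if (op == BulkInsert && rowWriterEnabled) {
      // early return in the middle of a long method: the "hidden branching"
      return "bulkInsertAsRow path"
    }
    // generic HoodieRecord write path
    "generic write path"
  }

  def main(args: Array[String]): Unit = {
    println(writeInternal(BulkInsert, rowWriterEnabled = true))  // Row path
    println(writeInternal(BulkInsert, rowWriterEnabled = false)) // generic path
  }
}
```

Because the two paths diverge this early, a restriction implemented on one path, like the multiple-bulk-insert check for COW SIMPLE bucket tables exercised by the test above, has to be handled separately on the other, which is what makes the branching easy to miss.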