wombatu-kun commented on code in PR #12245:
URL: https://github.com/apache/hudi/pull/12245#discussion_r1856094849


##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestInsertTable.scala:
##########
@@ -1668,47 +1668,102 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
               Seq(3, "a3,3", 30.0, 3000, "2021-01-07")
             )
 
-            spark.sql(
-              s"""
-                 | insert into $tableName values
-                 | (1, 'a1', 10, 1000, "2021-01-05"),
-                 | (3, "a3", 30, 3000, "2021-01-07")
-               """.stripMargin)
+            // for a COW table, appending with bulk insert multiple times is restricted
+            if (tableType != "cow") {
+              spark.sql(
+                s"""
+                   | insert into $tableName values
+                   | (1, 'a1', 10, 1000, "2021-01-05"),
+                   | (3, "a3", 30, 3000, "2021-01-07")
+                 """.stripMargin)
 
-            checkAnswer(s"select id, name, price, ts, dt from $tableName")(
-              Seq(1, "a1,1", 10.0, 1000, "2021-01-05"),
-              Seq(1, "a1", 10.0, 1000, "2021-01-05"),
-              Seq(2, "a2", 20.0, 2000, "2021-01-06"),
-              Seq(3, "a3,3", 30.0, 3000, "2021-01-07"),
-              Seq(3, "a3", 30.0, 3000, "2021-01-07")
-            )
+              checkAnswer(s"select id, name, price, ts, dt from $tableName")(
+                Seq(1, "a1,1", 10.0, 1000, "2021-01-05"),
+                Seq(1, "a1", 10.0, 1000, "2021-01-05"),
+                Seq(2, "a2", 20.0, 2000, "2021-01-06"),
+                Seq(3, "a3,3", 30.0, 3000, "2021-01-07"),
+                Seq(3, "a3", 30.0, 3000, "2021-01-07")
+              )
 
-            // there are two files in partition(dt = '2021-01-05')
-            checkAnswer(s"select count(distinct _hoodie_file_name) from 
$tableName where dt = '2021-01-05'")(
-              Seq(2)
-            )
+              // there are two files in partition(dt = '2021-01-05')
+              checkAnswer(s"select count(distinct _hoodie_file_name) from $tableName where dt = '2021-01-05'")(
+                Seq(2)
+              )
 
-            // would generate 6 other files in partition(dt = '2021-01-05')
-            spark.sql(
-              s"""
-                 | insert into $tableName values
-                 | (4, 'a1,1', 10, 1000, "2021-01-05"),
-                 | (5, 'a1,1', 10, 1000, "2021-01-05"),
-                 | (6, 'a1,1', 10, 1000, "2021-01-05"),
-                 | (7, 'a1,1', 10, 1000, "2021-01-05"),
-                 | (8, 'a1,1', 10, 1000, "2021-01-05"),
-                 | (10, 'a3,3', 30, 3000, "2021-01-05")
-               """.stripMargin)
-
-            checkAnswer(s"select count(distinct _hoodie_file_name) from $tableName where dt = '2021-01-05'")(
-              Seq(8)
-            )
+              // would generate 6 other files in partition(dt = '2021-01-05')
+              spark.sql(
+                s"""
+                   | insert into $tableName values
+                   | (4, 'a1,1', 10, 1000, "2021-01-05"),
+                   | (5, 'a1,1', 10, 1000, "2021-01-05"),
+                   | (6, 'a1,1', 10, 1000, "2021-01-05"),
+                   | (7, 'a1,1', 10, 1000, "2021-01-05"),
+                   | (8, 'a1,1', 10, 1000, "2021-01-05"),
+                   | (10, 'a3,3', 30, 3000, "2021-01-05")
+                 """.stripMargin)
+
+              checkAnswer(s"select count(distinct _hoodie_file_name) from $tableName where dt = '2021-01-05'")(
+                Seq(8)
+              )
+            }
           }
         }
       }
     }
   }
 
+  test("Test not supported multiple BULK INSERTs into SIMPLE BUCKET for COW 
without Spark native Row") {
+    withSQLConf("hoodie.datasource.write.operation" -> "bulk_insert",
+      "hoodie.bulkinsert.shuffle.parallelism" -> "1") {
+      withTempDir { tmp =>
+        val tableName = generateTableName
+        spark.sql(
+          s"""
+             |create table $tableName (
+             |  id long,
+             |  name string,
+             |  ts int,
+             |  par string
+             |) using hudi
+             | tblproperties (
+             | primaryKey = 'id,name',
+             | type = 'cow',
+             | preCombineField = 'ts',
+             | hoodie.index.type = 'BUCKET',
+             | hoodie.index.bucket.engine = 'SIMPLE',
+             | hoodie.bucket.index.num.buckets = '4',
+             | hoodie.bucket.index.hash.field = 'id,name',
+             | hoodie.datasource.write.row.writer.enable = 'false')
+             | partitioned by (par)
+             | location '${tmp.getCanonicalPath}'
+             """.stripMargin)
+        //   `id,name` -> `bucketId` if there are 4 buckets
+        //    5,'a1,1' ->    1
+        //    6,'a6,6' ->    2
+        //    9,'a3,3' ->    1
+        // 13,'a13,13' ->    2
+        //     24,'cd' ->    0
+        // buckets 1 & 2 into partition 'main', bucket 1 into partition 'side'
+        spark.sql(s"insert into $tableName values (5, 'a1,1', 1, 'main'), (6, 
'a6,6', 1, 'main'), (9, 'a3,3', 1, 'side')")
+        // bucket 1 into 'main', bucket 2 into 'side'; the whole insert will fail due to the existing bucket 1 in 'main'
+        val causeRegex = "Multiple bulk insert.* COW.* not supported.*"
+        checkExceptionMatch(s"insert into $tableName values (9, 'a3,3', 2, 'main'), (13, 'a13,13', 1, 'side')")(causeRegex)

Review Comment:
   > And in the case of `ENABLE_ROW_WRITER = true` we use `BaseDatasetBulkInsertCommitActionExecutor` instead of `SparkBulkInsertCommitActionExecutor`.
   
   As I see from the documentation and @Danny's comment (https://github.com/apache/hudi/pull/12245#discussion_r1851871655), the bulk_insert operation is "for initial loading/bootstrapping a Hudi table at first" (https://hudi.apache.org/docs/write_operations#bulk_insert), so it should be used only on empty tables. Once you have done one bulk_insert, the table is no longer empty, so all subsequent operations must be upserts to avoid potential data consistency issues.
   So my opinion is: for COW tables with a Simple Bucket index we should throw an exception on any attempt to bulk_insert into a non-empty table, no matter what `ENABLE_ROW_WRITER` value is configured.
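   
   A minimal sketch of the guard I have in mind, assuming hypothetical predicates (`isCowTable`, `usesSimpleBucketIndex` and `tableIsNonEmpty` are illustrative names, not existing Hudi APIs; in practice they would be derived from the table config and timeline):
   
   ```scala
   import org.apache.hudi.exception.HoodieException
   
   // Sketch only: reject a repeated bulk_insert into a non-empty COW table
   // that uses a simple bucket index.
   def validateBulkInsertPrerequisites(isCowTable: Boolean,
                                       usesSimpleBucketIndex: Boolean,
                                       tableIsNonEmpty: Boolean): Unit = {
     // The row-writer setting is deliberately not consulted here: the
     // restriction should apply for both executor paths.
     if (isCowTable && usesSimpleBucketIndex && tableIsNonEmpty) {
       throw new HoodieException(
         "Multiple bulk insert operations into a COW table with a simple bucket index " +
           "are not supported; use upsert after the initial load.")
     }
   }
   ```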


