Re: [PR] [HUDI-8394] Restrict same partition multiple bulk inserts in append mode into COW with bucket index [hudi]

via GitHub Fri, 15 Nov 2024 02:26:07 -0800


geserdugarov commented on code in PR #12245:
URL: https://github.com/apache/hudi/pull/12245#discussion_r1843553364



##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestInsertTable.scala:
##########
@@ -1668,47 +1668,99 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
               Seq(3, "a3,3", 30.0, 3000, "2021-01-07")
             )
 
-            spark.sql(
-              s"""
-                 | insert into $tableName values
-                 | (1, 'a1', 10, 1000, "2021-01-05"),
-                 | (3, "a3", 30, 3000, "2021-01-07")
-               """.stripMargin)
+            // for COW table append bulk insert multiple times is restricted
+            if (tableType != "cow") {
+              spark.sql(
+                s"""
+                   | insert into $tableName values
+                   | (1, 'a1', 10, 1000, "2021-01-05"),
+                   | (3, "a3", 30, 3000, "2021-01-07")
+                 """.stripMargin)
 
-            checkAnswer(s"select id, name, price, ts, dt from $tableName")(
-              Seq(1, "a1,1", 10.0, 1000, "2021-01-05"),
-              Seq(1, "a1", 10.0, 1000, "2021-01-05"),
-              Seq(2, "a2", 20.0, 2000, "2021-01-06"),
-              Seq(3, "a3,3", 30.0, 3000, "2021-01-07"),
-              Seq(3, "a3", 30.0, 3000, "2021-01-07")
-            )
+              checkAnswer(s"select id, name, price, ts, dt from $tableName")(
+                Seq(1, "a1,1", 10.0, 1000, "2021-01-05"),
+                Seq(1, "a1", 10.0, 1000, "2021-01-05"),
+                Seq(2, "a2", 20.0, 2000, "2021-01-06"),
+                Seq(3, "a3,3", 30.0, 3000, "2021-01-07"),
+                Seq(3, "a3", 30.0, 3000, "2021-01-07")
+              )
 
-            // there are two files in partition(dt = '2021-01-05')
-            checkAnswer(s"select count(distinct _hoodie_file_name) from 
$tableName where dt = '2021-01-05'")(
-              Seq(2)
-            )
+              // there are two files in partition(dt = '2021-01-05')
+              checkAnswer(s"select count(distinct _hoodie_file_name) from 
$tableName where dt = '2021-01-05'")(
+                Seq(2)
+              )
 
-            // would generate 6 other files in partition(dt = '2021-01-05')
-            spark.sql(
-              s"""
-                 | insert into $tableName values
-                 | (4, 'a1,1', 10, 1000, "2021-01-05"),
-                 | (5, 'a1,1', 10, 1000, "2021-01-05"),
-                 | (6, 'a1,1', 10, 1000, "2021-01-05"),
-                 | (7, 'a1,1', 10, 1000, "2021-01-05"),
-                 | (8, 'a1,1', 10, 1000, "2021-01-05"),
-                 | (10, 'a3,3', 30, 3000, "2021-01-05")
-               """.stripMargin)
-
-            checkAnswer(s"select count(distinct _hoodie_file_name) from 
$tableName where dt = '2021-01-05'")(
-              Seq(8)
-            )
+              // would generate 6 other files in partition(dt = '2021-01-05')
+              spark.sql(
+                s"""
+                   | insert into $tableName values
+                   | (4, 'a1,1', 10, 1000, "2021-01-05"),
+                   | (5, 'a1,1', 10, 1000, "2021-01-05"),
+                   | (6, 'a1,1', 10, 1000, "2021-01-05"),
+                   | (7, 'a1,1', 10, 1000, "2021-01-05"),
+                   | (8, 'a1,1', 10, 1000, "2021-01-05"),
+                   | (10, 'a3,3', 30, 3000, "2021-01-05")
+                 """.stripMargin)
+
+              checkAnswer(s"select count(distinct _hoodie_file_name) from 
$tableName where dt = '2021-01-05'")(
+                Seq(8)
+              )
+            }
           }
         }
       }
     }
   }
 
+  test("Test not supported double Bulk Insert Into Bucket Index COW Table") {
+    withSQLConf("hoodie.datasource.write.operation" -> "bulk_insert", 
"hoodie.bulkinsert.shuffle.parallelism" -> "1") {
+      withTempDir { tmp =>
+        val tableName = generateTableName
+        spark.sql(
+          s"""
+             |create table $tableName (
+             |  id int,
+             |  dt string,
+             |  name string,
+             |  price double,
+             |  ts long
+             |) using hudi
+             | tblproperties (
+             | primaryKey = 'id,name',
+             | type = 'cow',
+             | preCombineField = 'ts',
+             | hoodie.index.type = 'BUCKET',
+             | hoodie.index.bucket.engine = 'SIMPLE',
+             | hoodie.bucket.index.num.buckets = '2',
+             | hoodie.bucket.index.hash.field = 'id,name',
+             | hoodie.datasource.write.row.writer.enable = 'false')
+             | partitioned by (dt)
+             | location '${tmp.getCanonicalPath}'
+             """.stripMargin)
+        spark.sql(
+          s"""
+             | insert into $tableName values
+             | (5, 'a1,1', 10, 1000, "2021-01-05")
+             """.stripMargin)
+        checkExceptionContain(
+          s"""
+             | insert into $tableName values
+             | (9, 'a3,3', 30, 3000, "2021-01-05")

Review Comment:
   @KnightChess, I've added a test for other partitions:
   
https://github.com/apache/hudi/pull/12245/files#diff-2c06898918dddc19f0b90ecd806d81faf88f30d0ae7bb8be509019b6db9c2accR1741



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [HUDI-8394] Restrict same partition multiple bulk inserts in append mode into COW with bucket index [hudi]

Reply via email to