KnightChess commented on code in PR #8072:
URL: https://github.com/apache/hudi/pull/8072#discussion_r1120241446


##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala:
##########
@@ -1125,4 +1123,152 @@ class TestInsertTable extends HoodieSparkSqlTestBase {
       }
     }
   }
+
+  test("Test Insert Overwrite Into Bucket Index COW Table") {
+    withRecordType()(withTempDir { tmp =>
+      val tableName = generateTableName
+      // Create a partitioned table
+      spark.sql(
+        s"""
+           |create table $tableName (
+           |  id int,
+           |  name string,
+           |  price double,
+           |  ts long,
+           |  dt string
+           |) using hudi
+           |tblproperties (
+           |  type = 'cow',
+           |  primaryKey = 'id',
+           |  preCombineField = 'ts',
+           |  hoodie.index.type = 'BUCKET',
+           |  hoodie.bucket.index.num.buckets = '4'
+           |)
+           | partitioned by (dt)
+           | location '${tmp.getCanonicalPath}/$tableName'
+       """.stripMargin)
+
+      spark.sql(
+        s"""insert into $tableName  values
+           |(5, 'a', 35, 1000, '2021-01-05'),
+           |(1, 'a', 31, 1000, '2021-01-05'),
+           |(3, 'a', 33, 1000, '2021-01-05'),
+           |(4, 'b', 16, 1000, '2021-01-05'),
+           |(2, 'b', 18, 1000, '2021-01-05'),
+           |(6, 'b', 17, 1000, '2021-01-05'),
+           |(8, 'a', 21, 1000, '2021-01-05'),
+           |(9, 'a', 22, 1000, '2021-01-05'),
+           |(7, 'a', 23, 1000, '2021-01-05')
+           |""".stripMargin)
+
+      // Insert overwrite static partition
+      spark.sql(
+        s"""
+           | insert overwrite table $tableName partition(dt = '2021-01-05')
+           | select * from (select 13 , 'a2', 12, 1000) limit 10
+        """.stripMargin)
+      checkAnswer(s"select id, name, price, ts, dt from $tableName order by 
dt")(
+        Seq(13, "a2", 12.0, 1000, "2021-01-05")
+      )
+    })
+  }
+
+  test("Test Insert Overwrite Into Bucket Index MOR Table") {
+    withRecordType()(withTempDir { tmp =>
+      val tableName = generateTableName
+      // Create a partitioned table
+      spark.sql(
+        s"""
+           |create table $tableName (
+           |  id int,
+           |  name string,
+           |  price double,
+           |  ts long,
+           |  dt string
+           |) using hudi
+           |tblproperties (
+           |  type = 'mor',
+           |  primaryKey = 'id',
+           |  preCombineField = 'ts',
+           |  hoodie.index.type = 'BUCKET',
+           |  hoodie.bucket.index.num.buckets = '4'
+           |)
+           | partitioned by (dt)
+           | location '${tmp.getCanonicalPath}/$tableName'
+       """.stripMargin)
+
+      spark.sql(
+        s"""insert into $tableName  values
+           |(5, 'a', 35, 1000, '2021-01-05'),
+           |(1, 'a', 31, 1000, '2021-01-05'),
+           |(3, 'a', 33, 1000, '2021-01-05'),
+           |(4, 'b', 16, 1000, '2021-01-05'),
+           |(2, 'b', 18, 1000, '2021-01-05'),
+           |(6, 'b', 17, 1000, '2021-01-05'),
+           |(8, 'a', 21, 1000, '2021-01-05'),
+           |(9, 'a', 22, 1000, '2021-01-05'),
+           |(7, 'a', 23, 1000, '2021-01-05')
+           |""".stripMargin)
+
+      // Insert overwrite static partition
+      spark.sql(
+        s"""
+           | insert overwrite table $tableName partition(dt = '2021-01-05')
+           | select * from (select 13 , 'a2', 12, 1000) limit 10
+        """.stripMargin)
+      checkAnswer(s"select id, name, price, ts, dt from $tableName order by 
dt")(
+        Seq(13, "a2", 12.0, 1000, "2021-01-05")
+      )
+    })
+  }
+
+  test("Test Insert Overwrite Into Consistent Bucket Index Table") {
+    withRecordType()(withTempDir { tmp =>
+      val tableName = generateTableName
+      // Create a partitioned table
+      spark.sql(
+        s"""
+           |create table $tableName (
+           |  id int,
+           |  name string,
+           |  price double,
+           |  ts long,
+           |  dt string
+           |) using hudi
+           |tblproperties (
+           |  type = 'mor',
+           |  primaryKey = 'id',
+           |  preCombineField = 'ts',
+           |  hoodie.index.type = 'BUCKET',
+           |  hoodie.index.bucket.engine = "CONSISTENT_HASHING",
+           |  hoodie.bucket.index.num.buckets = '4'
+           |)
+           | partitioned by (dt)
+           | location '${tmp.getCanonicalPath}/$tableName'
+       """.stripMargin)
+
+      spark.sql(
+        s"""insert into $tableName  values
+           |(5, 'a', 35, 1000, '2021-01-05'),
+           |(1, 'a', 31, 1000, '2021-01-05'),
+           |(3, 'a', 33, 1000, '2021-01-05'),
+           |(4, 'b', 16, 1000, '2021-01-05'),
+           |(2, 'b', 18, 1000, '2021-01-05'),
+           |(6, 'b', 17, 1000, '2021-01-05'),
+           |(8, 'a', 21, 1000, '2021-01-05'),
+           |(9, 'a', 22, 1000, '2021-01-05'),
+           |(7, 'a', 23, 1000, '2021-01-05')
+           |""".stripMargin)
+
+      // Insert overwrite static partition
+      spark.sql(
+        s"""
+           | insert overwrite table $tableName partition(dt = '2021-01-05')

Review Comment:
   This will create a new parquet file with the same prefix as the log file,
but with a different fgId suffix. As shown in the picture, creating the new
parquet file appends `-0` to the fgId (xxx-0-0_xxx), so the data can still be
read if insert overwrite runs only once; but if insert overwrite runs again, it
reuses the same fgId (xxx-0-0), and the query returns nothing.
   <img width="588" alt="image"
src="https://user-images.githubusercontent.com/20125927/221895955-195a00a6-7eda-4b4a-ac3d-c6e5f10bfd27.png">
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to