[GitHub] [hudi] danny0405 commented on a diff in pull request #8990: [HUDI-6387] Fix Spark CDC when delete all records from a filegroup

via GitHub Thu, 15 Jun 2023 18:52:28 -0700


danny0405 commented on code in PR #8990:
URL: https://github.com/apache/hudi/pull/8990#discussion_r1231697487



##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCDCForSparkSQL.scala:
##########
@@ -44,6 +44,47 @@ class TestCDCForSparkSQL extends HoodieSparkSqlTestBase {
     assertEquals(expectedDeletedCnt, cdcData.where("op = 'd'").count())
   }
 
+  test("Test delete all records in filegroup") {
+    withTempDir { tmp =>
+      val databaseName = "hudi_database"
+      spark.sql(s"create database if not exists $databaseName")
+      spark.sql(s"use $databaseName")
+      val tableName = generateTableName
+      val basePath = s"${tmp.getCanonicalPath}/$tableName"
+      spark.sql(
+        s"""
+           | create table $tableName (
+           |  id int,
+           |  name string,
+           |  price double,
+           |  ts long
+           | ) using hudi
+           | partitioned by (name)
+           | tblproperties (
+           |   'primaryKey' = 'id',
+           |   'preCombineField' = 'ts',
+           |   'hoodie.table.cdc.enabled' = 'true',
+           |   'hoodie.table.cdc.supplemental.logging.mode' = 
'$DATA_BEFORE_AFTER',
+           |   type = 'cow'
+           | )
+           | location '$basePath'
+      """.stripMargin)
+      val metaClient = HoodieTableMetaClient.builder()
+        .setBasePath(basePath)
+        .setConf(spark.sessionState.newHadoopConf())
+        .build()
+      spark.sql(s"insert into $tableName values (1, 11, 1000, 'a1'), (2, 12, 
1000, 'a2')")
+      val commitTime1 = 
metaClient.reloadActiveTimeline.lastInstant().get().getTimestamp
+      val cdcDataOnly1 = cdcDataFrame(basePath, commitTime1.toLong - 1)
+      cdcDataOnly1.show(false)
+      assertCDCOpCnt(cdcDataOnly1, 2, 0, 0)
+
+      spark.sql(s"delete from $tableName where id = 1")
+      val cdcDataOnly2 = cdcDataFrame(basePath, commitTime1.toLong)
+      assertCDCOpCnt(cdcDataOnly2, 0, 0, 1)

Review Comment:
   +1



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [hudi] danny0405 commented on a diff in pull request #8990: [HUDI-6387] Fix Spark CDC when delete all records from a filegroup

Reply via email to