yihua commented on code in PR #12602:
URL: https://github.com/apache/hudi/pull/12602#discussion_r1908007317
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlCoreFlow.scala:
##########
@@ -91,87 +90,101 @@ class TestSparkSqlCoreFlow extends HoodieSparkSqlTestBase {
val dataGen = new
HoodieTestDataGenerator(HoodieTestDataGenerator.TRIP_NESTED_EXAMPLE_SCHEMA,
0xDEED)
//Bulk insert first set of records
- val inputDf0 = generateInserts(dataGen, "000", 100).cache()
+ val inputDf0 = generateInserts(dataGen, "000", 10).cache()
insertInto(tableName, tableBasePath, inputDf0, BULK_INSERT,
isMetadataEnabled, 1)
+ val inputDf0Rows = canonicalizeDF(inputDf0).collect()
+ inputDf0.unpersist(true)
assertTrue(hasNewCommits(fs, tableBasePath, "000"))
//Verify bulk insert works correctly
- val snapshotDf1 = doSnapshotRead(tableName, isMetadataEnabled).cache()
- assertEquals(100, snapshotDf1.count())
- compareEntireInputDfWithHudiDf(inputDf0, snapshotDf1)
- snapshotDf1.unpersist(true)
+ val snapshotDf1 = doSnapshotRead(tableName, isMetadataEnabled)
+ val snapshotDf1Rows =
canonicalizeDF(dropMetaColumns(snapshotDf1)).collect()
+ assertEquals(10, snapshotDf1.count())
+ compareEntireInputRowsWithHudiRows(inputDf0Rows, snapshotDf1Rows)
//Test updated records
- val updateDf = generateUniqueUpdates(dataGen, "001", 50).cache()
+ val updateDf = generateUniqueUpdates(dataGen, "001", 5).cache()
insertInto(tableName, tableBasePath, updateDf, UPSERT, isMetadataEnabled,
2)
- val commitInstantTime2 = latestCommit(fs, tableBasePath)
- val snapshotDf2 = doSnapshotRead(tableName, isMetadataEnabled).cache()
- assertEquals(100, snapshotDf2.count())
- compareUpdateDfWithHudiDf(updateDf, snapshotDf2, snapshotDf1)
- snapshotDf2.unpersist(true)
+ val commitCompletedInstant2 = latestCompletedCommitCompletionTime(fs,
tableBasePath)
Review Comment:
```suggestion
val commitInstant2 = latestCompletedCommitCompletionTime(fs,
tableBasePath)
```
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlCoreFlow.scala:
##########
@@ -91,87 +90,101 @@ class TestSparkSqlCoreFlow extends HoodieSparkSqlTestBase {
val dataGen = new
HoodieTestDataGenerator(HoodieTestDataGenerator.TRIP_NESTED_EXAMPLE_SCHEMA,
0xDEED)
//Bulk insert first set of records
- val inputDf0 = generateInserts(dataGen, "000", 100).cache()
+ val inputDf0 = generateInserts(dataGen, "000", 10).cache()
Review Comment:
The same applies to the other places.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]