yihua commented on code in PR #11859:
URL: https://github.com/apache/hudi/pull/11859#discussion_r1751092092


##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestCompactionTable.scala:
##########
@@ -136,4 +136,74 @@ class TestCompactionTable extends HoodieSparkSqlTestBase {
       )
     }
   }
+
+  test("Test compaction before and after deletes") {
+    withTempDir { tmp =>
+      val tableName = generateTableName
+      spark.sql(
+        s"""
+           |create table $tableName (
+           |  id int,
+           |  name string,
+           |  price double,
+           |  ts long
+           |) using hudi
+           | location '${tmp.getCanonicalPath}'
+           | tblproperties (
+           |  primaryKey ='id',
+           |  type = 'mor',
+           |  preCombineField = 'ts'
+           | )
+       """.stripMargin)
+      spark.sql("set hoodie.parquet.max.file.size = 10000")
+      // disable automatic inline compaction
+      spark.sql("set hoodie.compact.inline=false")
+      spark.sql("set hoodie.compact.schedule.inline=false")
+      // set compaction frequency to every 2 commits
+      spark.sql("set hoodie.compact.inline.max.delta.commits=2")
+      // insert data
+      spark.sql(s"insert into $tableName values(1, 'a1', 10, 1000)")
+      spark.sql(s"insert into $tableName values(2, 'a2', 10, 1000)")
+      spark.sql(s"insert into $tableName values(3, 'a3', 10, 1000)")
+      // update data
+      spark.sql(s"update $tableName set price = 11 where id = 1")
+      // update data
+      spark.sql(s"update $tableName set price = 12 where id = 2")
+      // schedule compaction
+      spark.sql(s"schedule compaction  on $tableName")

Review Comment:
   ```suggestion
         spark.sql(s"schedule compaction on $tableName")
   ```



##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestAutoGenerationOfRecordKeys.scala:
##########
@@ -264,12 +267,14 @@ class TestAutoGenerationOfRecordKeys extends 
HoodieSparkClientTestBase with Scal
     assertEquals(5, snapshot0.count())
   }
 
-  @Test
-  def testUpsertsAndDeletesWithPkLess(): Unit = {
+  @ParameterizedTest
+  @EnumSource(value = classOf[HoodieTableType])
+  def testUpsertsAndDeletesWithPkLess(tableType: HoodieTableType): Unit = {

Review Comment:
   IMO we can keep this test on both COW and MOR and leave others untouched.



##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlWithTimestampKeyGenerator.scala:
##########
@@ -102,29 +102,32 @@ class TestSparkSqlWithTimestampKeyGenerator extends 
HoodieSparkSqlTestBase {
   }
 
   test("Test mandatory partitioning for timestamp key generator") {
-    withTempDir { tmp =>
-      spark.sql(
-        s"""
-           | CREATE TABLE should_fail (
-           |   id int,
-           |   name string,
-           |   precomb long,
-           |   ts long
-           | ) USING HUDI
-           | LOCATION '${tmp.getCanonicalPath + "/should_fail"}'
-           | TBLPROPERTIES (
-           |   type = 'COPY_ON_WRITE',
-           |   primaryKey = 'id',
-           |   preCombineField = 'precomb',
-           |   hoodie.table.keygenerator.class = 
'org.apache.hudi.keygen.TimestampBasedKeyGenerator',
-           |   ${timestampKeyGeneratorSettings.head}
-           | )
-           |""".stripMargin)
-      // should fail due to absent partitioning
-      assertThrows[HoodieException] {
-        spark.sql(s"INSERT INTO should_fail VALUES 
${dataBatchesWithLongOfSeconds(0)}")
-      }
+    Seq("cow", "mor").foreach { tableType =>

Review Comment:
   Partitioning should have nothing to do with table type, so we can keep this 
unchanged? I saw that the `hudi-spark` test time has increased a lot.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to