[GitHub] [hudi] nsivabalan commented on a change in pull request #2776: [HUDI-1768] spark datasource support schema validate add column

GitBox Tue, 06 Apr 2021 19:52:20 -0700


nsivabalan commented on a change in pull request #2776:
URL: https://github.com/apache/hudi/pull/2776#discussion_r608304445




##########
File path: 
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala
##########
@@ -669,4 +669,90 @@ class TestCOWDataSource extends HoodieClientTestBase {
       .load(basePath)
     assertEquals(N + 1, hoodieIncViewDF1.count())
   }
+
+  @Test def testSchemaEvolution(): Unit = {
+    // open the schema validate
+    val  opts = commonOpts ++ Map("hoodie.avro.schema.validate" -> "true")
+    // 1. write records with schema1
+    val schema1 = StructType(StructField("_row_key", StringType, true) :: 
StructField("name", StringType, true)::
+      StructField("timestamp", IntegerType, true) :: StructField("partition", 
IntegerType, true)::Nil)
+    val records1 = Seq(Row("1", "Andy", 1, 1),
+      Row("2", "lisi", 1, 1),
+      Row("3", "zhangsan", 1, 1))
+    val rdd = jsc.parallelize(records1)
+    val  recordsDF = spark.createDataFrame(rdd, schema1)
+    recordsDF.write.format("org.apache.hudi")
+      .options(opts)
+      .mode(SaveMode.Append)
+      .save(basePath)
+
+
+    // 2. write records with schema2 add column age
+    val schema2 = StructType(StructField("_row_key", StringType, true) :: 
StructField("name", StringType, true) ::
+      StructField("age", StringType, true) :: StructField("timestamp", 
IntegerType, true) ::
+      StructField("partition", IntegerType, true)::Nil)
+
+    val records2 = Seq(Row("11", "Andy", "10", 1, 1),
+      Row("22", "lisi", "11",1, 1),
+      Row("33", "zhangsan", "12", 1, 1))
+    val rdd2 = jsc.parallelize(records2)
+    val  recordsDF2 = spark.createDataFrame(rdd2, schema2)
+    recordsDF2.write.format("org.apache.hudi")
+      .options(opts)
+      .mode(SaveMode.Append)
+      .save(basePath)
+
+    val recordsReadDF = spark.read.format("org.apache.hudi")
+      .load(basePath + "/*/*")
+    recordsReadDF.show(false)
+    val resultSchema = new StructType(recordsReadDF.schema.filter(p=> 
!p.name.startsWith("_hoodie")).toArray)
+    assertEquals(resultSchema, schema2)
+
+    // 3. write records with schema3 delete column name
+    try {
+      val schema3 = StructType(StructField("_row_key", StringType, true) ::
+        StructField("age", StringType, true) :: StructField("timestamp", 
IntegerType, true) ::
+        StructField("partition", IntegerType, true)::Nil)
+
+      val records3 = Seq(Row("11", "10", 1, 1),
+        Row("22", "11",1, 1),
+        Row("33", "12", 1, 1))
+      val rdd3 = jsc.parallelize(records3)
+      val  recordsDF3 = spark.createDataFrame(rdd3, schema3)
+      recordsDF3.write.format("org.apache.hudi")
+        .options(opts)
+        .mode(SaveMode.Append)
+        .save(basePath)
+      fail("Delete column should fail")
+    } catch {
+      case ex: HoodieUpsertException =>
+        assertTrue(ex.getMessage.equals("Failed upsert schema compatibility 
check."))
+    }
+  }
+
+
+  @Test def testSchemaNotEqualData(): Unit = {

Review comment:
       can you confirm if my understanding of this test case is right. 
   when ingested batch has lesser columns compared to the schema passed in, 
writes should still succeed and the table schema should be the schema passed 
and not the ingested df's schema. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [hudi] nsivabalan commented on a change in pull request #2776: [HUDI-1768] spark datasource support schema validate add column

Reply via email to