cloud-fan commented on code in PR #52920:
URL: https://github.com/apache/spark/pull/52920#discussion_r2500483463


##########
sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala:
##########
@@ -1057,4 +1058,257 @@ class DataSourceV2DataFrameSuite
         (!compareValue || left.getValue == right.getValue)
     }
   }
+
+  test("SPARK-54157: detect table ID change after DataFrame analysis") {
+    val t = "testcat.ns1.ns2.tbl"
+    val ident = Identifier.of(Array("ns1", "ns2"), "tbl")
+    withTable(t) {
+      sql(s"CREATE TABLE $t (id INT, data STRING) USING foo")
+      sql(s"INSERT INTO $t VALUES (1, 'a'), (2, 'b')")
+
+      // create DataFrame and trigger analysis
+      val df = spark.table(t)
+
+      // capture original table
+      val originalTable = catalog("testcat").loadTable(ident)
+      val originalId = originalTable.id()
+
+      // drop and recreate table with same name and schema
+      sql(s"DROP TABLE $t")
+      sql(s"CREATE TABLE $t (id INT, data STRING) USING foo")
+      sql(s"INSERT INTO $t VALUES (3, 'c')")
+
+      // load new table
+      val newTable = catalog("testcat").loadTable(ident)
+      val newId = newTable.id()
+
+      // verify IDs are different
+      assert(originalId != newId)
+
+      // execution should fail with table ID mismatch
+      checkError(
+        exception = intercept[AnalysisException] { df.collect() },
+        condition = "INCOMPATIBLE_TABLE_CHANGE_AFTER_ANALYSIS.TABLE_ID_MISMATCH",
+        sqlState = Some("51024"),
+        parameters = Map(
+          "tableName" -> "`testcat`.`ns1`.`ns2`.`tbl`",
+          "capturedTableId" -> originalId,
+          "detectedTableId" -> newId))
+    }
+  }
+
+  test("SPARK-54157: detect column removal after DataFrame analysis") {
+    val t = "testcat.ns1.ns2.tbl"
+    withTable(t) {
+      sql(s"CREATE TABLE $t (id INT, data STRING, extra STRING) USING foo")
+      sql(s"INSERT INTO $t VALUES (1, 'a', 'x')")
+
+      // create DataFrame and trigger analysis
+      val df = spark.table(t).select($"id", $"data", $"extra")
+
+      // remove column in table
+      sql(s"ALTER TABLE $t DROP COLUMN extra")
+
+      // execution should fail with column mismatch
+      checkError(
+        exception = intercept[AnalysisException] { df.collect() },
+        condition = "INCOMPATIBLE_TABLE_CHANGE_AFTER_ANALYSIS.COLUMNS_MISMATCH",
+        parameters = Map(
+          "tableName" -> "`testcat`.`ns1`.`ns2`.`tbl`",
+          "errors" -> "\n- `extra` STRING is missing"))
+    }
+  }
+
+  test("SPARK-54157: detect column addition after DataFrame analysis") {
+    val t = "testcat.ns1.ns2.tbl"
+    withTable(t) {
+      sql(s"CREATE TABLE $t (id INT, data STRING) USING foo")
+      sql(s"INSERT INTO $t VALUES (1, 'a')")
+
+      // create DataFrame and trigger analysis
+      val df = spark.table(t)
+
+      // add columns to table
+      sql(s"ALTER TABLE $t ADD COLUMN new_col1 INT")
+      sql(s"ALTER TABLE $t ADD COLUMN new_col2 INT")
+
+      // execution should fail with column mismatch
+      checkError(
+        exception = intercept[AnalysisException] { df.collect() },
+        condition = "INCOMPATIBLE_TABLE_CHANGE_AFTER_ANALYSIS.COLUMNS_MISMATCH",
+        parameters = Map(
+          "tableName" -> "`testcat`.`ns1`.`ns2`.`tbl`",
+          "errors" ->
+            """
+              |- `new_col1` INT has been added
+              |- `new_col2` INT has been added""".stripMargin))
+    }
+  }
+
+  test("SPARK-54157: detect multiple change types after DataFrame analysis") {
+    val t = "testcat.ns1.ns2.tbl"
+    withTable(t) {
+      sql(s"CREATE TABLE $t (col1 INT, col2 STRING, col3 BOOLEAN, col4 STRING) 
USING foo")
+      sql(s"INSERT INTO $t VALUES (1, 'a', true, 'x')")
+
+      // create DataFrame and trigger analysis
+      val df = spark.table(t).select($"col1", $"col2", $"col3", $"col4")
+
+      // make multiple changes in table
+      sql(s"ALTER TABLE $t DROP COLUMN col4")
+      sql(s"ALTER TABLE $t ADD COLUMN col5 INT")
+
+      // execution should fail with column mismatch
+      checkError(
+        exception = intercept[AnalysisException] { df.collect() },
+        condition = "INCOMPATIBLE_TABLE_CHANGE_AFTER_ANALYSIS.COLUMNS_MISMATCH",
+        parameters = Map(
+          "tableName" -> "`testcat`.`ns1`.`ns2`.`tbl`",
+          "errors" ->
+            """
+              |- `col4` STRING is missing
+              |- `col5` INT has been added""".stripMargin))
+    }
+  }
+
+  test("SPARK-54157: detect nested struct field changes after DataFrame 
analysis") {
+    val t = "testcat.ns1.ns2.tbl"
+    withTable(t) {
+      sql(s"CREATE TABLE $t (id INT, person STRUCT<name: STRING, age: INT>) 
USING foo")
+      sql(s"INSERT INTO $t SELECT 1, named_struct('name', 'Alice', 'age', 30)")
+
+      // create DataFrame and trigger analysis
+      val df = spark.table(t)
+
+      // add nested field to struct column
+      sql(s"ALTER TABLE $t ADD COLUMN person.city STRING")
+
+      // execution should fail with column mismatch
+      checkError(
+        exception = intercept[AnalysisException] { df.collect() },
+        condition = "INCOMPATIBLE_TABLE_CHANGE_AFTER_ANALYSIS.COLUMNS_MISMATCH",
+        parameters = Map(
+          "tableName" -> "`testcat`.`ns1`.`ns2`.`tbl`",
+          "errors" ->
+            ("\n- `person` type has changed from STRUCT<name: STRING, age: 
INT> " +
+              "to STRUCT<name: STRING, age: INT, city: STRING>")))

Review Comment:
   the error message will be hard to read with super wide or deeply nested
   struct types. I think we should perform the check recursively and point to
   the exact nested fields in the error message, as sketched below.
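
   A minimal sketch of what that recursive check could look like (a
   hypothetical helper, not code from this PR; the `SchemaDiffSketch` name
   and the exact message strings are illustrative):

   ```scala
   import org.apache.spark.sql.types.StructType

   object SchemaDiffSketch {
     // Recursively compare the captured vs. detected schema and report every
     // difference with a fully qualified field path, e.g. `person`.`city`.
     def diff(
         captured: StructType,
         detected: StructType,
         path: Seq[String] = Nil): Seq[String] = {
       val detectedByName = detected.fields.map(f => f.name -> f).toMap
       val capturedNames = captured.fieldNames.toSet

       val missing = captured.fields.toSeq.collect {
         case f if !detectedByName.contains(f.name) =>
           s"- ${quote(path :+ f.name)} ${f.dataType.sql} is missing"
       }
       val added = detected.fields.toSeq.collect {
         case f if !capturedNames.contains(f.name) =>
           s"- ${quote(path :+ f.name)} ${f.dataType.sql} has been added"
       }
       val changed = captured.fields.toSeq.flatMap { f =>
         detectedByName.get(f.name).toSeq.flatMap { d =>
           (f.dataType, d.dataType) match {
             // recurse into structs so only the changed nested fields are listed
             case (cs: StructType, ds: StructType) => diff(cs, ds, path :+ f.name)
             case (ct, dt) if ct != dt =>
               Seq(s"- ${quote(path :+ f.name)} type has changed " +
                 s"from ${ct.sql} to ${dt.sql}")
             case _ => Nil
           }
         }
       }
       missing ++ added ++ changed
     }

     private def quote(path: Seq[String]): String =
       path.map(n => s"`$n`").mkString(".")
   }
   ```

   With that, the nested-struct test above would expect something like
   ``- `person`.`city` STRING has been added`` instead of echoing both full
   struct types.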


