Re: [PR] [HUDI-4588][HUDI-4472] Addressing schema handling issues in the write path [hudi]

via GitHub Sun, 12 Oct 2025 18:37:57 -0700


nsivabalan commented on code in PR #6358:
URL: https://github.com/apache/hudi/pull/6358#discussion_r2425086622



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala:
##########
@@ -429,37 +563,41 @@ object HoodieSparkSqlWriter {
     }
   }
 
-  /**
-   * Checks if schema needs upgrade (if incoming record's write schema is old 
while table schema got evolved).
-   *
-   * @param fs           instance of FileSystem.
-   * @param basePath     base path.
-   * @param sparkContext instance of spark context.
-   * @param schema       incoming record's schema.
-   * @return Pair of(boolean, table schema), where first entry will be true 
only if schema conversion is required.
-   */
-  def getLatestTableSchema(fs: FileSystem, basePath: Path, sparkContext: 
SparkContext): Option[Schema] = {
-    if (FSUtils.isTableExists(basePath.toString, fs)) {
-      val tableMetaClient = HoodieTableMetaClient.builder
-        .setConf(sparkContext.hadoopConfiguration)
-        .setBasePath(basePath.toString)
-        .build()
-      val tableSchemaResolver = new TableSchemaResolver(tableMetaClient)
-
-      
toScalaOption(tableSchemaResolver.getTableAvroSchemaFromLatestCommit(false))
-    } else {
-      None
+  private def registerAvroSchemasWithKryo(sparkContext: SparkContext, 
targetAvroSchemas: Schema*): Unit = {
+    sparkContext.getConf.registerAvroSchemas(targetAvroSchemas: _*)
+  }
+
+  private def getLatestTableSchema(spark: SparkSession,
+                                   tableBasePath: Path,
+                                   tableId: TableIdentifier,
+                                   hadoopConf: Configuration): Option[Schema] 
= {
+    val fs = tableBasePath.getFileSystem(hadoopConf)
+    val latestTableSchemaFromCommitMetadata =
+      if (FSUtils.isTableExists(tableBasePath.toString, fs)) {
+        val tableMetaClient = HoodieTableMetaClient.builder
+          .setConf(hadoopConf)
+          .setBasePath(tableBasePath.toString)
+          .build()
+        val tableSchemaResolver = new TableSchemaResolver(tableMetaClient)
+        
toScalaOption(tableSchemaResolver.getTableAvroSchemaFromLatestCommit(false))
+      } else {
+        None
+      }
+
+    latestTableSchemaFromCommitMetadata.orElse {
+      getCatalogTable(spark, tableId).map { catalogTable =>

Review Comment:
   Not sure why we added this fallback that polls the catalog when the table 
   does not have any valid commits. 
   We should just remove it. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [HUDI-4588][HUDI-4472] Addressing schema handling issues in the write path [hudi]

Reply via email to