This is an automated email from the ASF dual-hosted git repository.

yihua pushed a commit to branch release-1.2.0
in repository https://gitbox.apache.org/repos/asf/hudi.git

commit bf5f4e3eb8b53e2c30a03ddad6edfb805d35f03f
Author: Prashant Wason <[email protected]>
AuthorDate: Wed Apr 29 19:40:56 2026 -0700

    fix: filter spark.sql.* properties in SparkCatalogMetaStoreClient.toCatalogTable (#18654)
    
    Spark's HiveExternalCatalog.alterTable / createTable rejects table
    properties whose keys start with "spark.sql." with:
    
      AnalysisException: Cannot persist <table> into Hive metastore as table
      property keys may not start with 'spark.sql.': [spark.sql.create.version,
      spark.sql.sources.provider, spark.sql.sources.schema.partCol.0,
      spark.sql.sources.schema.numParts, spark.sql.sources.schema.numPartCols,
      spark.sql.sources.schema.part.0]
    
    These keys are reserved for Spark's internal use (provider, schema parts,
    create version) and Spark itself writes them when persisting a CatalogTable.
    On the way back through getTable they appear in the parameters map, and
    toCatalogTable currently passes them straight through. The next alter_table
    call then trips the validation and the entire HoodieHiveSyncClient flow
    fails - no actual sync happens.
    
    Strip "spark.sql.*" keys in toCatalogTable before constructing the
    CatalogTable. Spark re-derives and writes them from the CatalogTable, so
    dropping them on the way in is safe.
    
    Co-authored-by: Claude Opus 4.7 <[email protected]>
---
 .../apache/spark/sql/hive/SparkCatalogMetaStoreClient.scala    | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/SparkCatalogMetaStoreClient.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/SparkCatalogMetaStoreClient.scala
index d238341bed46..bf73a5bb064f 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/SparkCatalogMetaStoreClient.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/SparkCatalogMetaStoreClient.scala
@@ -302,6 +302,14 @@ class SparkCatalogMetaStoreClient(syncConfig: 
HiveSyncConfig)
     val dataFields = cols.map(fs => StructField(fs.getName, 
CatalystSqlParser.parseDataType(fs.getType), nullable = true, Metadata.empty))
     val partitionFields = partCols.map(fs => StructField(fs.getName, 
CatalystSqlParser.parseDataType(fs.getType), nullable = true, Metadata.empty))
 
+    // Strip "spark.sql.*" properties before handing off to Spark's external catalog.
+    // HiveExternalCatalog.alterTable / createTable rejects such keys ("Cannot persist ...
+    // table property keys may not start with 'spark.sql.'") because they are reserved for
+    // Spark's internal use (provider, schema parts, create version). Spark re-derives and
+    // writes these from the CatalogTable itself, so dropping them on the way in is safe.
+    val tableProperties = 
Option(table.getParameters).map(_.asScala.toMap).getOrElse(Map.empty)
+      .filterNot { case (k, _) => k.startsWith("spark.sql.") }
+
     CatalogTable(
       identifier = TableIdentifier(tbl, Some(db)),
       tableType = if ("EXTERNAL_TABLE".equalsIgnoreCase(table.getTableType)) 
CatalogTableType.EXTERNAL else CatalogTableType.MANAGED,
@@ -315,7 +323,7 @@ class SparkCatalogMetaStoreClient(syncConfig: 
HiveSyncConfig)
       schema = StructType(dataFields ++ partitionFields),
       provider = Some("hudi"),
       partitionColumnNames = partCols.map(_.getName),
-      properties = 
Option(table.getParameters).map(_.asScala.toMap).getOrElse(Map.empty))
+      properties = tableProperties)
   }
 
   private def fromCatalogTable(table: CatalogTable): Table = {

Reply via email to