This is an automated email from the ASF dual-hosted git repository. yihua pushed a commit to branch release-1.2.0 in repository https://gitbox.apache.org/repos/asf/hudi.git
commit bf5f4e3eb8b53e2c30a03ddad6edfb805d35f03f Author: Prashant Wason <[email protected]> AuthorDate: Wed Apr 29 19:40:56 2026 -0700 fix: filter spark.sql.* properties in SparkCatalogMetaStoreClient.toCatalogTable (#18654) Spark's HiveExternalCatalog.alterTable / createTable reject table properties whose keys start with "spark.sql." with: AnalysisException: Cannot persist <table> into Hive metastore as table property keys may not start with 'spark.sql.': [spark.sql.create.version, spark.sql.sources.provider, spark.sql.sources.schema.partCol.0, spark.sql.sources.schema.numParts, spark.sql.sources.schema.numPartCols, spark.sql.sources.schema.part.0] These keys are reserved for Spark's internal use (provider, schema parts, create version) and Spark itself writes them when persisting a CatalogTable. On the way back through getTable they appear in the parameters map, and toCatalogTable currently passes them straight through. The next alter_table call then trips this validation and the entire HoodieHiveSyncClient flow fails, so no sync actually happens. Strip "spark.sql.*" keys in toCatalogTable before constructing the CatalogTable. Spark re-derives and writes them from the CatalogTable, so dropping them on the way in is safe. 
Co-authored-by: Claude Opus 4.7 <[email protected]> --- .../apache/spark/sql/hive/SparkCatalogMetaStoreClient.scala | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/SparkCatalogMetaStoreClient.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/SparkCatalogMetaStoreClient.scala index d238341bed46..bf73a5bb064f 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/SparkCatalogMetaStoreClient.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hive/SparkCatalogMetaStoreClient.scala @@ -302,6 +302,14 @@ class SparkCatalogMetaStoreClient(syncConfig: HiveSyncConfig) val dataFields = cols.map(fs => StructField(fs.getName, CatalystSqlParser.parseDataType(fs.getType), nullable = true, Metadata.empty)) val partitionFields = partCols.map(fs => StructField(fs.getName, CatalystSqlParser.parseDataType(fs.getType), nullable = true, Metadata.empty)) + // Strip "spark.sql.*" properties before handing off to Spark's external catalog. + // HiveExternalCatalog.alterTable / createTable rejects such keys ("Cannot persist ... + // table property keys may not start with 'spark.sql.'") because they are reserved for + // Spark's internal use (provider, schema parts, create version). Spark re-derives and + // writes these from the CatalogTable itself, so dropping them on the way in is safe. 
+ val tableProperties = Option(table.getParameters).map(_.asScala.toMap).getOrElse(Map.empty) + .filterNot { case (k, _) => k.startsWith("spark.sql.") } + CatalogTable( identifier = TableIdentifier(tbl, Some(db)), tableType = if ("EXTERNAL_TABLE".equalsIgnoreCase(table.getTableType)) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED, @@ -315,7 +323,7 @@ class SparkCatalogMetaStoreClient(syncConfig: HiveSyncConfig) schema = StructType(dataFields ++ partitionFields), provider = Some("hudi"), partitionColumnNames = partCols.map(_.getName), - properties = Option(table.getParameters).map(_.asScala.toMap).getOrElse(Map.empty)) + properties = tableProperties) } private def fromCatalogTable(table: CatalogTable): Table = {
