This is an automated email from the ASF dual-hosted git repository.
danny0405 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new c51a8cfd5b2 [HUDI-6073] Table create schema should not include
metadata fields (#8450)
c51a8cfd5b2 is described below
commit c51a8cfd5b2dad181039d2a1d1af59ba4f0e737a
Author: Danny Chan <[email protected]>
AuthorDate: Fri Apr 14 17:11:28 2023 +0800
[HUDI-6073] Table create schema should not include metadata fields (#8450)
---
.../apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala | 9 +++++++--
.../test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala | 8 ++++++--
2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala
index 09eb9f2944f..eebe3db0be2 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.catalog
import org.apache.hudi.DataSourceWriteOptions.OPERATION
import org.apache.hudi.HoodieWriterUtils._
+import org.apache.hudi.avro.HoodieAvroUtils
import org.apache.hudi.common.config.DFSPropertiesConfiguration
import org.apache.hudi.common.model.HoodieTableType
import org.apache.hudi.common.table.HoodieTableConfig.URL_ENCODE_PARTITIONING
@@ -19,6 +172,10 @@ class HoodieCatalogTable(val spark: SparkSession, var table: CatalogTable) exten
def initHoodieTable(): Unit = {
logInfo(s"Init hoodie.properties for ${table.identifier.unquotedString}")
val (finalSchema, tableConfigs) = parseSchemaAndConfigs()
+    // The TableSchemaResolver#getTableAvroSchemaInternal has a premise that
+    // the table create schema does not include the metadata fields.
+    // When flag includeMetadataFields is false, no metadata fields should be included in the resolved schema.
+    val dataSchema = removeMetaFields(finalSchema)
table = table.copy(schema = finalSchema)
@@ -187,11 +192,11 @@ class HoodieCatalogTable(val spark: SparkSession, var table: CatalogTable) exten
HoodieTableMetaClient.withPropertyBuilder()
.fromProperties(properties)
.setDatabaseName(catalogDatabaseName)
-      .setTableCreateSchema(SchemaConverters.toAvroType(finalSchema).toString())
+      .setTableCreateSchema(SchemaConverters.toAvroType(dataSchema).toString())
.initTable(hadoopConf, tableLocation)
} else {
      val (recordName, namespace) = AvroConversionUtils.getAvroRecordNameAndNamespace(table.identifier.table)
-      val schema = SchemaConverters.toAvroType(finalSchema, false, recordName, namespace)
+      val schema = SchemaConverters.toAvroType(dataSchema, nullable = false, recordName, namespace)
val partitionColumns = if (table.partitionColumnNames.isEmpty) {
null
} else {
diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala
index e975ca8a4b7..4ffcfa062e8 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala
@@ -30,10 +30,9 @@ import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase.getLastCommitMetadata
import org.apache.spark.sql.types._
-import org.junit.jupiter.api.Assertions.assertFalse
+import org.junit.jupiter.api.Assertions.{assertFalse, assertTrue}
import scala.collection.JavaConverters._
-import scala.collection.Seq
class TestCreateTable extends HoodieSparkSqlTestBase {
@@ -86,6 +85,11 @@ class TestCreateTable extends HoodieSparkSqlTestBase {
assertResult(tableName)(tableConfig.getTableName)
assertFalse(tableConfig.contains(OPERATION.key()))
+ val schemaOpt = tableConfig.getTableCreateSchema
+ assertTrue(schemaOpt.isPresent, "Table create schema should be persisted")
+    assertFalse(schemaOpt.get().getFields.asScala.exists(f => HoodieRecord.HOODIE_META_COLUMNS.contains(f.name())),
+      "Table create schema should not include metadata fields")
+
spark.sql("use default")
}