wang168003 opened a new issue, #8341: URL: https://github.com/apache/hudi/issues/8341
**_Tips before filing an issue_** 1. As was proposed before by another user, there is a bug (https://issues.apache.org/jira/browse/HUDI-4588) when upserting records that are missing some columns which already exist in the hudi table. So I made a test in the latest version 0.13.0. 2. First I wrote records that have two columns: ``` import org.apache.hudi.QuickstartUtils._ import scala.collection.JavaConversions._ import org.apache.spark.sql.SaveMode._ import org.apache.hudi.DataSourceReadOptions._ import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.config.HoodieWriteConfig._ import org.apache.hudi.common.model.HoodieRecord val tableName = "dwd_hudi_test" val basePath = "s3://xxx/test" case class userinfo(username:String,age:Int) val df = spark.sparkContext.parallelize(Seq(userinfo("gaolei",22),userinfo("limeii",31))).toDF df.write.format("hudi") .options(getQuickstartWriteConfigs) .option(PRECOMBINE_FIELD_OPT_KEY, "username") .option(RECORDKEY_FIELD_OPT_KEY, "username") .option(TABLE_NAME, tableName) .option("hoodie.datasource.hive_sync.database","dw") .option("hoodie.datasource.hive_sync.table",tableName) .option("hoodie.datasource.hive_sync.enable","true") .mode(Overwrite) .save(basePath) ``` 3. Then I appended records with 3 columns, 2 of which are the same as before. ``` case class userinfo3(username:String,age:Int,addr:String) val df = spark.sparkContext.parallelize(Seq(userinfo3("wanglei",25,"123"),userinfo3("zhangmeii",31,"sdf"))).toDF df.write.format("hudi") .options(getQuickstartWriteConfigs) .option(PRECOMBINE_FIELD_OPT_KEY, "username") .option(RECORDKEY_FIELD_OPT_KEY, "username") .option(TABLE_NAME, tableName) .option("hoodie.schema.on.read.enable","true") .option("hoodie.datasource.hive_sync.database","dw") .option("hoodie.datasource.hive_sync.table",tableName) .option("hoodie.datasource.hive_sync.enable","true") .mode(Append) .save(basePath) ``` 4. 
And in the end, I appended records with 2 columns just like in step 2: ``` val df = spark.sparkContext.parallelize(Seq(userinfo("hanlei",22),userinfo("luxin",31))).toDF df.write.format("hudi") .options(getQuickstartWriteConfigs) .option(PRECOMBINE_FIELD_OPT_KEY, "username") .option(RECORDKEY_FIELD_OPT_KEY, "username") .option(TABLE_NAME, tableName) .option("hoodie.datasource.hive_sync.database","dw") .option("hoodie.datasource.hive_sync.table",tableName) .option("hoodie.datasource.hive_sync.enable","true") .option("hoodie.schema.on.read.enable","true") .mode(Append) .save(basePath) ``` **Expected behavior** I thought this bug had been fixed in version 0.13.0, so I expected the last two records could be written into the hudi table successfully. But I got an exception instead. So I don't know why this would happen. Could anyone help me? : ) (And this is my first issue on github) **Environment Description** * Hudi version : 0.13 * Spark version : 3.3.1 * Hive version : 3.1.3 * Hadoop version : 3.2.1 * Storage (HDFS/S3/GCS..) : S3 * Running on Docker? (yes/no) : no **Additional context** Add any other context about the problem here. **Stacktrace** ``` ERROR HoodieSparkSqlWriter$: Incoming batch schema is not compatible with the table's one. 
Incoming schema { "type" : "record", "name" : "dwd_hudi_test_gewang_record", "namespace" : "hoodie.dwd_hudi_test_gewang", "fields" : [ { "name" : "username", "type" : [ "null", "string" ], "default" : null }, { "name" : "age", "type" : "int" } ] } Incoming schema (canonicalized) { "type" : "record", "name" : "dwd_hudi_test_gewang_record", "namespace" : "hoodie.dwd_hudi_test_gewang", "fields" : [ { "name" : "username", "type" : [ "null", "string" ], "default" : null }, { "name" : "age", "type" : "int" } ] } Table's schema { "type" : "record", "name" : "dwd_hudi_test_gewang_record", "namespace" : "hoodie.dwd_hudi_test_gewang", "fields" : [ { "name" : "username", "type" : [ "null", "string" ], "default" : null }, { "name" : "age", "type" : "int" }, { "name" : "addr", "type" : [ "null", "string" ], "default" : null } ] } org.apache.hudi.exception.SchemaCompatibilityException: Incoming batch schema is not compatible with the table's one at org.apache.hudi.HoodieSparkSqlWriter$.deduceWriterSchema(HoodieSparkSqlWriter.scala:487) at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:305) at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:150) at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73) at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84) at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:103) at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107) at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:224) at 
org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:114) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$7(SQLExecution.scala:139) at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107) at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:224) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:139) at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:245) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:138) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68) at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:100) at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:96) at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:615) at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:177) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:615) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30) at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267) at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30) at 
org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:591) at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:96) at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:83) at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:81) at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:124) at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860) at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390) at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363) at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239) ... 59 elided .``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
