[ https://issues.apache.org/jira/browse/HUDI-453?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
lamber-ken resolved HUDI-453. ----------------------------- Resolution: Fixed Fixed at master 58c5bed40a76189a28dd72bbd67fcefaac587184 > Throw failed to archive commits error when writing data to MOR/COW table > ------------------------------------------------------------------------ > > Key: HUDI-453 > URL: https://issues.apache.org/jira/browse/HUDI-453 > Project: Apache Hudi (incubating) > Issue Type: Bug > Components: Writer Core > Reporter: lamber-ken > Assignee: lamber-ken > Priority: Major > Labels: pull-request-available > Fix For: 0.5.1 > > Time Spent: 20m > Remaining Estimate: 0h > > Throw failed to archive commits error when writing data to table, here are the > steps to reproduce. > *1, Build from latest source* > {code:java} > mvn clean package -DskipTests -DskipITs -Dcheckstyle.skip=true -Drat.skip=true > {code} > *2, Write Data* > {code:java} > export SPARK_HOME=/work/BigData/install/spark/spark-2.3.3-bin-hadoop2.6 > ${SPARK_HOME}/bin/spark-shell --jars `ls > packaging/hudi-spark-bundle/target/hudi-spark-bundle-*.*.*-SNAPSHOT.jar` > --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' > import org.apache.spark.sql.SaveMode._ > var datas = List("{ \"name\": \"kenken\", \"ts\": 1574297893836, \"age\": 12, > \"location\": \"latitude\"}") > val df = spark.read.json(spark.sparkContext.parallelize(datas, 2)) > df.write.format("org.apache.hudi"). > option("hoodie.insert.shuffle.parallelism", "10"). > option("hoodie.upsert.shuffle.parallelism", "10"). > option("hoodie.delete.shuffle.parallelism", "10"). > option("hoodie.bulkinsert.shuffle.parallelism", "10"). > option("hoodie.datasource.write.recordkey.field", "name"). > option("hoodie.datasource.write.partitionpath.field", "location"). > option("hoodie.datasource.write.precombine.field", "ts"). > option("hoodie.table.name", "hudi_mor_table"). > mode(Overwrite). > save("file:///tmp/hudi_mor_table") > {code} > *3, Append Data* > {code:java} > df.write.format("org.apache.hudi"). 
> option("hoodie.insert.shuffle.parallelism", "10"). > option("hoodie.upsert.shuffle.parallelism", "10"). > option("hoodie.delete.shuffle.parallelism", "10"). > option("hoodie.bulkinsert.shuffle.parallelism", "10"). > option("hoodie.datasource.write.recordkey.field", "name"). > option("hoodie.datasource.write.partitionpath.field", "location"). > option("hoodie.datasource.write.precombine.field", "ts"). > option("hoodie.keep.max.commits", "5"). > option("hoodie.keep.min.commits", "4"). > option("hoodie.cleaner.commits.retained", "3"). > option("hoodie.table.name", "hudi_mor_table"). > mode(Append). > save("file:///tmp/hudi_mor_table") > {code} > *4, Repeat the Append Data operation (above) about six times, and you will get the > stack trace* > {code:java} > 19/12/23 01:30:48 ERROR HoodieCommitArchiveLog: Failed to archive commits, > .commit file: 20191224004558.clean.requested > java.io.IOException: Not an Avro data file > at org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:50) > at > org.apache.hudi.common.util.AvroUtils.deserializeAvroMetadata(AvroUtils.java:147) > at > org.apache.hudi.common.util.CleanerUtils.getCleanerPlan(CleanerUtils.java:88) > at > org.apache.hudi.io.HoodieCommitArchiveLog.convertToAvroRecord(HoodieCommitArchiveLog.java:294) > at > org.apache.hudi.io.HoodieCommitArchiveLog.archive(HoodieCommitArchiveLog.java:253) > at > org.apache.hudi.io.HoodieCommitArchiveLog.archiveIfRequired(HoodieCommitArchiveLog.java:122) > at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:562) > at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:523) > at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:514) > at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:159) > at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:91) > at > org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45) > at > 
org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70) > at > org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68) > at > org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152) > at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127) > at > org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80) > at > org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80) > at > org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:656) > at > org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:656) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77) > at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:656) > at > org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273) > at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267) > at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:225) > {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)