Hi @mayu1,
This issue has been fixed in the master branch <https://github.com/apache/incubator-hudi>. You can check it out, build from source, and continue your test program. Looking forward to your feedback on whether the problem has been solved.
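If it helps, building from source usually looks something like this (a rough sketch assuming the project's standard Maven build; please check the repository README for the exact steps):

    git clone https://github.com/apache/incubator-hudi.git
    cd incubator-hudi
    mvn clean package -DskipTests

You can then pass the rebuilt hudi-spark-bundle jar (typically under packaging/hudi-spark-bundle/target/) to spark-submit via --jars.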
best,
lamber-ken

At 2019-12-26 06:43:55, "Vinoth Chandar" <[email protected]> wrote:
>Filed HUDI-468 ("Not an avro data file" error while archiving post
>rename() change) <https://issues.apache.org/jira/browse/HUDI-468> to track
>this
>
>On Mon, Dec 23, 2019 at 11:40 PM [email protected] <[email protected]>
>wrote:
>
>> Thank you, I have replaced it with hudi-spark-bundle-0.5.0-incubating.jar,
>> and the program seems to be stable.
>>
>> ------------------------------
>> [email protected]
>>
>>
>> *From:* lamberken <[email protected]>
>> *Sent:* 2019-12-24 11:24
>> *To:* dev <[email protected]>
>> *Subject:* Re: How to write a performance test program
>>
>> Hi @mayu1,
>>
>> I guess you used the latest master branch; this bug seems to have
>> appeared after HUDI-398 was merged.
>> I met the same exception, and I am trying to fix it [1].
>>
>> You can try building the source from before that commit, then continue
>> your test.
>>
>> [1] https://issues.apache.org/jira/browse/HUDI-453
>>
>> best,
>> lamber-ken
>>
>>
>>
>> At 2019-12-24 11:11:41, "[email protected]" <[email protected]> wrote:
>> >Hello!
>> >I want to modify the quickstart program for performance testing and
>> >generate a dataset of ten million rows. However, the program reports an
>> >error after it has run multiple times.
>> >
>> >error:
>> >Exception in thread "main" org.apache.hudi.exception.HoodieCommitException: Failed to archive commits
>> >at org.apache.hudi.io.HoodieCommitArchiveLog.archive(HoodieCommitArchiveLog.java:266)
>> >at org.apache.hudi.io.HoodieCommitArchiveLog.archiveIfRequired(HoodieCommitArchiveLog.java:122)
>> >at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:562)
>> >at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:523)
>> >at org.apache.hudi.HoodieWriteClient.commit(HoodieWriteClient.java:514)
>> >at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:152)
>> >at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:91)
>> >at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
>> >at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
>> >at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
>> >at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
>> >at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
>> >at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
>> >at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
>> >at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>> >at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
>> >at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
>> >at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
>> >at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
>> >at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
>> >at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
>> >at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
>> >at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
>> >at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
>> >at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
>> >at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
>> >at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
>> >at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
>> >at HudiUpdate$.main(HudiUpdate.scala:38)
>> >at HudiUpdate.main(HudiUpdate.scala)
>> >at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>> >at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>> >at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>> >at java.lang.reflect.Method.invoke(Method.java:498)
>> >at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
>> >at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:845)
>> >at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
>> >at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
>> >at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
>> >at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:920)
>> >at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:929)
>> >at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>> >Caused by: java.io.IOException: Not an Avro data file
>> >at org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:50)
>> >at org.apache.hudi.common.util.AvroUtils.deserializeAvroMetadata(AvroUtils.java:147)
>> >at org.apache.hudi.common.util.CleanerUtils.getCleanerPlan(CleanerUtils.java:88)
>> >at org.apache.hudi.io.HoodieCommitArchiveLog.convertToAvroRecord(HoodieCommitArchiveLog.java:294)
>> >at org.apache.hudi.io.HoodieCommitArchiveLog.archive(HoodieCommitArchiveLog.java:253)
>> >... 41 more
>> >
>> >my program:
>> >import org.apache.spark.sql.SQLContext
>> >import org.apache.spark.{SparkConf, SparkContext}
>> >
>> >object HudiDataGen {
>> >  def main(args: Array[String]): Unit = {
>> >    import org.apache.hudi.DataSourceWriteOptions._
>> >    import org.apache.hudi.QuickstartUtils._
>> >    import org.apache.hudi.config.HoodieWriteConfig._
>> >    import org.apache.spark.sql.SaveMode._
>> >
>> >    import scala.collection.JavaConversions._
>> >
>> >    // Initialization
>> >    val conf = new SparkConf().setAppName("HudiTest")
>> >    // .setMaster("local")
>> >    conf.set("spark.serializer",
>> >      "org.apache.spark.serializer.KryoSerializer") // use the Kryo serialization library
>> >    val sc = new SparkContext(conf)
>> >    val spark = new SQLContext(sc)
>> >
>> >    // Set the table name, base path, and data generator used to generate records for this guide.
>> >    val tableName = "hudi_cow_table"
>> >    val basePath = "hdfs://172.16.44.28:8020/flink/hudi"
>> >    // val basePath = "file:///e:/hudi_cow_table"
>> >    val dataGen = new DataGenerator
>> >
>> >    // Generate some new trip samples, load them into a DataFrame, and write the DataFrame into the Hudi dataset as follows.
>> >    val inserts = convertToStringList(dataGen.generateInserts(1000000))
>> >    // println("insert:" + System.currentTimeMillis())
>> >    val df = spark.read.json(spark.sparkContext.parallelize(inserts, 8))
>> >    df.write.format("org.apache.hudi").
>> >      options(getQuickstartWriteConfigs).
>> >      option(PRECOMBINE_FIELD_OPT_KEY, "ts").
>> >      option(RECORDKEY_FIELD_OPT_KEY, "uuid").
>> >      option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
>> >      option(TABLE_NAME, tableName).
>> >      mode(Append).
>> >      save(basePath)
>> >    println("finish")
>> >  }
>> >}
>> >
>> >
>> >[email protected]
>>
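P.S. Archiving only kicks in once enough commits have accumulated on the timeline, which is why the error above appears only after the program has run several times. If you want to exercise that code path quickly while testing, here is a minimal Scala sketch that reuses spark, dataGen, tableName, and basePath from the quoted program; the archival and cleaner thresholds are illustrative values chosen so archiving triggers after just a few commits, not recommended settings:

import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.QuickstartUtils._
import org.apache.hudi.config.HoodieWriteConfig._
import org.apache.spark.sql.SaveMode._

import scala.collection.JavaConversions._

// Issue several small update commits so the archival code path runs quickly.
// Note: dataGen.generateUpdates only works after generateInserts has been called.
for (round <- 1 to 10) {
  val updates = convertToStringList(dataGen.generateUpdates(1000))
  val updateDf = spark.read.json(spark.sparkContext.parallelize(updates, 2))
  updateDf.write.format("org.apache.hudi").
    options(getQuickstartWriteConfigs).
    option(PRECOMBINE_FIELD_OPT_KEY, "ts").
    option(RECORDKEY_FIELD_OPT_KEY, "uuid").
    option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
    option(TABLE_NAME, tableName).
    // Illustrative low thresholds: archive once the timeline holds more than
    // 4 commits, keep at least 3, and retain 2 commits for the cleaner.
    option("hoodie.keep.min.commits", "3").
    option("hoodie.keep.max.commits", "4").
    option("hoodie.cleaner.commits.retained", "2").
    mode(Append).
    save(basePath)
  println(s"finished round $round")
}

// Read the dataset back with a snapshot query to confirm the row count
// (the four-level glob assumes the quickstart's partitionpath layout).
val readDf = spark.read.format("org.apache.hudi").load(basePath + "/*/*/*/*")
println("total rows: " + readDf.count())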
