rubenssoto commented on issue #2515: URL: https://github.com/apache/hudi/issues/2515#issuecomment-773020297
I made a new try, so: emr-6.2.0, Spark 3.0.1.

Build:

```
git clone https://github.com/apache/hudi.git && cd hudi
mvn clean package -DskipTests -Dspark3
```

Spark submit:

```
spark-submit --deploy-mode cluster \
  --conf spark.executor.cores=5 \
  --conf spark.executor.memoryOverhead=2000 \
  --conf spark.executor.memory=32g \
  --conf spark.yarn.maxAppAttempts=1 \
  --conf spark.dynamicAllocation.maxExecutors=4 \
  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
  --packages org.apache.spark:spark-avro_2.12:2.4.4 \
  --jars s3://dl/lib/spark-daria_2.12-0.38.2.jar,s3://ze-data-etl/lib/hudi-spark-bundle_2.12-0.8.0-SNAPSHOT.jar \
  --class TableProcessorWrapper \
  s3://dl//code/projects/data_projects/batch_processor_engine/batch-processor-engine_2.12-3.0.1_0.5.jar \
  courier_api_group01
```

Session setup:

```scala
import org.apache.spark.sql.SparkSession

trait SparkSessionWrapper extends Serializable {
  // Shared session for all table jobs; the Hudi bundle and spark-avro
  // are also passed on spark-submit above.
  val spark = {
    SparkSession.builder
      .appName("Batch Processor Engine")
      .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:2.4.4")
      .config("spark.jars", "s3://dl/lib/hudi-spark-bundle_2.12-0.8.0-SNAPSHOT.jar")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()
  }
}
```

Same error :(

```
Error on Table: deliveryman_route_history, Error Message: org.apache.hudi.exception.HoodieCommitException: Failed to archive commits
java.lang.Exception: Error on Table: deliveryman_route_history, Error Message: org.apache.hudi.exception.HoodieCommitException: Failed to archive commits
	at jobs.TableProcessor.start(TableProcessor.scala:102)
	at TableProcessorWrapper$.$anonfun$main$2(TableProcessorWrapper.scala:23)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1402)
	at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
	at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
	at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
	at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175)
```
And in the driver log:

```
21/02/04 04:05:40 ERROR HoodieTimelineArchiveLog: Failed to archive commits, .commit file: 20210203023129.commit.requested
java.lang.NullPointerException: null of string of map of union in field extraMetadata of org.apache.hudi.avro.model.HoodieCommitMetadata of union in field hoodieCommitMetadata of org.apache.hudi.avro.model.HoodieArchivedMetaEntry
	at org.apache.avro.generic.GenericDatumWriter.npe(GenericDatumWriter.java:145)
	at org.apache.avro.generic.GenericDatumWriter.writeWithoutConversion(GenericDatumWriter.java:139)
	at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:75)
	at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:62)
	at org.apache.hudi.common.table.log.block.HoodieAvroDataBlock.serializeRecords(HoodieAvroDataBlock.java:106)
	at org.apache.hudi.common.table.log.block.HoodieDataBlock.getContentBytes(HoodieDataBlock.java:97)
	at org.apache.hudi.common.table.log.HoodieLogFormatWriter.appendBlocks(HoodieLogFormatWriter.java:164)
	at org.apache.hudi.common.table.log.HoodieLogFormatWriter.appendBlock(HoodieLogFormatWriter.java:142)
	at org.apache.hudi.table.HoodieTimelineArchiveLog.writeToFile(HoodieTimelineArchiveLog.java:361)
	at org.apache.hudi.table.HoodieTimelineArchiveLog.archive(HoodieTimelineArchiveLog.java:311)
	at org.apache.hudi.table.HoodieTimelineArchiveLog.archiveIfRequired(HoodieTimelineArchiveLog.java:138)
	at org.apache.hudi.client.AbstractHoodieWriteClient.postCommit(AbstractHoodieWriteClient.java:426)
	at org.apache.hudi.client.AbstractHoodieWriteClient.commitStats(AbstractHoodieWriteClient.java:188)
	at org.apache.hudi.client.SparkRDDWriteClient.commit(SparkRDDWriteClient.java:110)
	at org.apache.hudi.HoodieSparkSqlWriter$.commitAndPerformPostOperations(HoodieSparkSqlWriter.scala:443)
	at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:218)
	at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:134)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:124)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:123)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:963)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:104)
	at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:227)
	at org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:132)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:104)
	at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:227)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:132)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:248)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:131)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:963)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:399)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)
	at hudiwriter.HudiWriter.merge(HudiWriter.scala:72)
	at hudiwriter.HudiContext.writeToHudi(HudiContext.scala:35)
	at jobs.TableProcessor.start(TableProcessor.scala:84)
	at TableProcessorWrapper$.$anonfun$main$2(TableProcessorWrapper.scala:23)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1402)
	at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
	at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
	at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
	at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175)
Caused by: java.lang.NullPointerException
	at org.apache.avro.io.Encoder.writeString(Encoder.java:121)
	at org.apache.avro.generic.GenericDatumWriter.writeString(GenericDatumWriter.java:267)
	at org.apache.avro.generic.GenericDatumWriter.writeString(GenericDatumWriter.java:262)
	at org.apache.avro.generic.GenericDatumWriter.writeWithoutConversion(GenericDatumWriter.java:128)
	at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:75)
	at org.apache.avro.generic.GenericDatumWriter.writeMap(GenericDatumWriter.java:234)
	at org.apache.avro.generic.GenericDatumWriter.writeWithoutConversion(GenericDatumWriter.java:121)
	at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:75)
	at org.apache.avro.generic.GenericDatumWriter.writeWithoutConversion(GenericDatumWriter.java:125)
	at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:75)
	at org.apache.avro.generic.GenericDatumWriter.writeField(GenericDatumWriter.java:166)
	at org.apache.avro.generic.GenericDatumWriter.writeRecord(GenericDatumWriter.java:156)
	at org.apache.avro.generic.GenericDatumWriter.writeWithoutConversion(GenericDatumWriter.java:118)
	at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:75)
	at org.apache.avro.generic.GenericDatumWriter.writeWithoutConversion(GenericDatumWriter.java:125)
	at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:75)
	at org.apache.avro.generic.GenericDatumWriter.writeField(GenericDatumWriter.java:166)
	at org.apache.avro.generic.GenericDatumWriter.writeRecord(GenericDatumWriter.java:156)
	at org.apache.avro.generic.GenericDatumWriter.writeWithoutConversion(GenericDatumWriter.java:118)
	... 59 more
```
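The NPE points at a null value inside `extraMetadata` of an older commit while the archiver re-serializes it to Avro. To confirm that, I was thinking of reading the commit metadata straight from `.hoodie`; a minimal sketch (the S3 base path is a placeholder for our real table location, and I'm assuming the populated metadata sits in the matching `.commit` JSON file rather than the empty `.requested` one):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import com.fasterxml.jackson.databind.ObjectMapper
import scala.collection.JavaConverters._

// Hypothetical base path; the instant 20210203023129 comes from the log above.
val commitFile = new Path(
  "s3://dl/raw_courier_api/deliveryman_route_history/.hoodie/20210203023129.commit")
val fs = commitFile.getFileSystem(new Configuration())

// The .commit file is plain JSON; flag any extraMetadata entry whose value
// is null, since that is what the Avro writer chokes on while archiving.
val root = new ObjectMapper().readTree(fs.open(commitFile))
Option(root.get("extraMetadata")).foreach { extra =>
  extra.fields().asScala.foreach { e =>
    if (e.getValue.isNull) println(s"null extraMetadata value for key: ${e.getKey}")
  }
}
```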
These are the Hudi options we use (this example is from the customer_notification table):

```
Map(
  hoodie.datasource.hive_sync.database -> raw_courier_api_hudi,
  hoodie.parquet.small.file.limit -> 67108864,
  hoodie.copyonwrite.record.size.estimate -> 1024,
  hoodie.datasource.write.precombine.field -> LineCreatedTimestamp,
  hoodie.datasource.hive_sync.partition_fields -> partitionpath,
  hoodie.datasource.hive_sync.partition_extractor_class -> org.apache.hudi.hive.MultiPartKeysValueExtractor,
  hoodie.parquet.max.file.size -> 134217728,
  hoodie.parquet.block.size -> 67108864,
  hoodie.datasource.hive_sync.table -> customer_notification,
  hoodie.datasource.write.operation -> upsert,
  hoodie.datasource.hive_sync.enable -> true,
  hoodie.datasource.write.recordkey.field -> id,
  hoodie.table.name -> customer_notification,
  hoodie.datasource.hive_sync.jdbcurl -> jdbc:hive2://ip-10-0-26-106.us-west-2.compute.internal:10000,
  hoodie.datasource.write.hive_style_partitioning -> true,
  hoodie.consistency.check.enabled -> true,
  hoodie.datasource.write.table.name -> customer_notification,
  hoodie.datasource.write.keygenerator.class -> org.apache.hudi.keygen.SimpleKeyGenerator,
  hoodie.upsert.shuffle.parallelism -> 50
)
```

(I sketch below how these options reach the writer.)

I'm using org.apache.spark:spark-avro_2.12:2.4.4 on Spark 3; could that be the problem? I updated to spark-avro_2.12-3.0.1.jar (the version-aligned submit is at the end of this comment) and the problem was not solved. Do I need to recreate the tables?

Thank you so much!
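As promised above, here is roughly how those options reach the writer. This is a simplified sketch; `writeHudi` and the target path are placeholders standing in for our real `hudiwriter.HudiWriter.merge` from the stack trace:

```scala
import org.apache.spark.sql.{DataFrame, SaveMode}

// Simplified stand-in for hudiwriter.HudiWriter.merge: hudiOptions is the
// Map shown above; the S3 target path is a placeholder.
def writeHudi(df: DataFrame, hudiOptions: Map[String, String]): Unit =
  df.write
    .format("hudi")
    .options(hudiOptions)
    .mode(SaveMode.Append)
    .save("s3://dl/raw_courier_api/customer_notification")
```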

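For completeness, the version-aligned retry mentioned above only swapped the spark-avro coordinate in the submit, on the assumption that spark-avro should match the Spark version:

```
# identical to the spark-submit above, except:
--packages org.apache.spark:spark-avro_2.12:3.0.1
```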