[
https://issues.apache.org/jira/browse/HUDI-2905?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Manoj Govindassamy updated HUDI-2905:
-------------------------------------
Priority: Blocker (was: Critical)
> Insert crashes in MOR table with NullPointerException from HoodieMergeHandle
> ----------------------------------------------------------------------------
>
> Key: HUDI-2905
> URL: https://issues.apache.org/jira/browse/HUDI-2905
> Project: Apache Hudi
> Issue Type: Task
> Reporter: Manoj Govindassamy
> Assignee: Manoj Govindassamy
> Priority: Blocker
> Fix For: 0.10.0
>
>
> Running Hoodie integration test suite with a MOR table type sometimes crashes
> with the following stack trace
> {noformat}
> at
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
> at
> org.apache.spark.rdd.DoubleRDDFunctions.sum(DoubleRDDFunctions.scala:35)
> at
> org.apache.spark.api.java.JavaDoubleRDD.sum(JavaDoubleRDD.scala:165)
> at
> org.apache.hudi.utilities.deltastreamer.DeltaSync.writeToSink(DeltaSync.java:519)
> at
> org.apache.hudi.utilities.deltastreamer.DeltaSync.syncOnce(DeltaSync.java:306)
> at
> org.apache.hudi.integ.testsuite.HoodieDeltaStreamerWrapper.upsert(HoodieDeltaStreamerWrapper.java:44)
> at
> org.apache.hudi.integ.testsuite.HoodieDeltaStreamerWrapper.insert(HoodieDeltaStreamerWrapper.java:48)
> at
> org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter.insert(HoodieTestSuiteWriter.java:166)
> at
> org.apache.hudi.integ.testsuite.dag.nodes.InsertNode.ingest(InsertNode.java:70)
> at
> org.apache.hudi.integ.testsuite.dag.nodes.InsertNode.execute(InsertNode.java:53)
> at
> org.apache.hudi.integ.testsuite.dag.scheduler.DagScheduler.executeNode(DagScheduler.java:139)
> ... 6 more
> Caused by: org.apache.hudi.exception.HoodieUpsertException: Error upserting
> bucketType UPDATE for partition :33
> at
> org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpsertPartition(BaseSparkCommitActionExecutor.java:320)
> at
> org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleInsertPartition(BaseSparkCommitActionExecutor.java:326)
> at
> org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.lambda$execute$ecf5068c$1(BaseSparkCommitActionExecutor.java:174)
> at
> org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1(JavaRDDLike.scala:102)
> at
> org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1$adapted(JavaRDDLike.scala:102)
> at
> org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:915)
> at
> org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:915)
> at
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at
> org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:386){noformat}
> Test properties
> {noformat}
> hoodie.insert.shuffle.parallelism=100
> hoodie.upsert.shuffle.parallelism=100
> hoodie.bulkinsert.shuffle.parallelism=100
> hoodie.deltastreamer.source.test.num_partitions=100
> hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false
> hoodie.deltastreamer.source.test.max_unique_records=100000000
> hoodie.embed.timeline.server=false
> hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector
> hoodie.datasource.hive_sync.skip_ro_suffix=true
> hoodie.datasource.write.recordkey.field=_row_key
> hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator
> hoodie.datasource.write.partitionpath.field=timestamp
> hoodie.clustering.plan.strategy.sort.columns=_row_key
> hoodie.clustering.plan.strategy.daybased.lookback.partitions=0
> hoodie.clustering.inline.max.commits=1
> hoodie.deltastreamer.source.dfs.root=s3a://dl-scale-test/manoj/010RC2/integration-test-large-scale/slong/mor/input
> hoodie.deltastreamer.schemaprovider.target.schema.file=file:/home/hadoop/staging/source.avsc
> hoodie.deltastreamer.schemaprovider.source.schema.file=file:/home/hadoop/staging/source.avsc
> hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP
> hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd
> hoodie.datasource.hive_sync.database=testdb{noformat}
> {noformat}
> /home/hadoop/spark-3.2.0-bin-hadoop3.2/bin/spark-submit \
> --packages org.apache.spark:spark-avro_2.12:3.2.0 \
> --conf spark.task.cpus=1 \
> --conf spark.executor.cores=1 \
> --conf spark.task.maxFailures=100 \
> --conf spark.memory.fraction=0.4 \
> --conf spark.rdd.compress=true \
> --conf spark.kryoserializer.buffer.max=2000m \
> --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
> --conf spark.memory.storageFraction=0.1 \
> --conf spark.shuffle.service.enabled=true \
> --conf spark.sql.hive.convertMetastoreParquet=false \
> --conf spark.driver.maxResultSize=12g \
> --conf spark.executor.heartbeatInterval=120s \
> --conf spark.network.timeout=600s \
> --conf spark.yarn.max.executor.failures=10 \
> --conf spark.sql.catalogImplementation=hive \
> --conf
> spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain
> \
> --conf spark.hadoop.fs.s3a.connection.maximum=500 \
> --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
> /home/hadoop/staging/hudi-integ-test-bundle-0.10.0-rc2.jar \
> --source-ordering-field test_suite_source_ordering_field \
> --use-deltastreamer \
> --target-base-path
> s3a://dl-scale-test/manoj/010RC2/integration-test-large-scale/slong/mor/output
> \
> --input-base-path
> s3a://dl-scale-test/manoj/010RC2/integration-test-large-scale/slong/mor/input
> \
> --target-table table1 \
> --props file:/home/hadoop/staging/MERGE_ON_READ-true-test-slong.properties \
> --schemaprovider-class
> org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
> --source-class org.apache.hudi.utilities.sources.AvroDFSSource \
> --input-file-size 125829120 \
> --workload-yaml-path file:/home/hadoop/staging/MERGE_ON_READ-true-slong.yaml \
> --workload-generator-classname
> org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
> --table-type MERGE_ON_READ \
> --compact-scheduling-minshare 1 \
> --clean-input \
> --clean-output{noformat}
--
This message was sent by Atlassian Jira
(v8.20.1#820001)