[
https://issues.apache.org/jira/browse/HUDI-4205?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Ethan Guo updated HUDI-4205:
----------------------------
Description:
Environment: EMR 6.6.0, OSS Spark 3.2.1, Hudi master
Storage: S3
When loading the metadata table in Spark shell using the following code, it
throws NullPointerException.
This also happens for the following combinations: (1) Spark 3.1.3, Hudi 0.11.0
(1) Spark 3.2.1, Hudi 0.11.0
{code:java}
spark.read.format("hudi").load("s3a://<base_path>/.hoodie/metadata/").show
{code}
{code:java}
Caused by: java.lang.NullPointerException
at
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:178)
at
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:167)
at
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:163)
at
org.apache.hudi.HoodieBaseRelation$.$anonfun$createHFileReader$1(HoodieBaseRelation.scala:531)
at
org.apache.hudi.HoodieBaseRelation.$anonfun$createBaseFileReader$1(HoodieBaseRelation.scala:482)
at
org.apache.hudi.HoodieMergeOnReadRDD.readBaseFile(HoodieMergeOnReadRDD.scala:130)
at
org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:100)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750) {code}
{code:java}
Caused by: java.lang.NullPointerException
at
org.apache.hudi.metadata.HoodieBackedTableMetadata.getLogRecordScanner(HoodieBackedTableMetadata.java:484)
at
org.apache.hudi.HoodieMergeOnReadRDD$.scanLog(HoodieMergeOnReadRDD.scala:342)
at
org.apache.hudi.HoodieMergeOnReadRDD$LogFileIterator.<init>(HoodieMergeOnReadRDD.scala:173)
at
org.apache.hudi.HoodieMergeOnReadRDD$RecordMergingFileIterator.<init>(HoodieMergeOnReadRDD.scala:252)
at
org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:101)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748) {code}
Spark shell:
{code:java}
./bin/spark-shell \
--master yarn \
--deploy-mode client \
--driver-memory 20g \
--executor-memory 20g \
--num-executors 2 \
--executor-cores 8 \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--conf spark.kryoserializer.buffer=256m \
--conf spark.kryoserializer.buffer.max=1024m \
--jars /home/hadoop/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \
--conf 'spark.eventLog.enabled=true' --conf
'spark.eventLog.dir=hdfs:///var/log/spark/apps' \
--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
--conf
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
{code}
was:
Environment: EMR 6.6.0, OSS Spark 3.2.1, Hudi master
Storage: S3
When loading the metadata table in Spark shell using the following code, it
throws NullPointerException.
This also happens for the following combinations: (1) Spark 3.1.3, Hudi 0.11.0
(1) Spark 3.2.1, Hudi 0.11.0
{code:java}
spark.read.format("hudi").load("s3a://<base_path>/.hoodie/metadata/").show
{code}
{code:java}
Caused by: java.lang.NullPointerException
at
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:178)
at
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:167)
at
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:163)
at
org.apache.hudi.HoodieBaseRelation$.$anonfun$createHFileReader$1(HoodieBaseRelation.scala:531)
at
org.apache.hudi.HoodieBaseRelation.$anonfun$createBaseFileReader$1(HoodieBaseRelation.scala:482)
at
org.apache.hudi.HoodieMergeOnReadRDD.readBaseFile(HoodieMergeOnReadRDD.scala:130)
at
org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:100)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750) {code}
Spark shell:
{code:java}
./bin/spark-shell \
--master yarn \
--deploy-mode client \
--driver-memory 20g \
--executor-memory 20g \
--num-executors 2 \
--executor-cores 8 \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--conf spark.kryoserializer.buffer=256m \
--conf spark.kryoserializer.buffer.max=1024m \
--jars /home/hadoop/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \
--conf 'spark.eventLog.enabled=true' --conf
'spark.eventLog.dir=hdfs:///var/log/spark/apps' \
--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
--conf
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
{code}
> Reading metadata table on S3 using Spark throws NullPointerException
> --------------------------------------------------------------------
>
> Key: HUDI-4205
> URL: https://issues.apache.org/jira/browse/HUDI-4205
> Project: Apache Hudi
> Issue Type: Bug
> Components: metadata, spark
> Affects Versions: 0.11.0
> Reporter: Ethan Guo
> Assignee: Ethan Guo
> Priority: Blocker
> Fix For: 0.12.0
>
>
> Environment: EMR 6.6.0, OSS Spark 3.2.1, Hudi master
> Storage: S3
> When loading the metadata table in Spark shell using the following code, it
> throws NullPointerException.
> This also happens for the following combinations: (1) Spark 3.1.3, Hudi
> 0.11.0 (1) Spark 3.2.1, Hudi 0.11.0
> {code:java}
> spark.read.format("hudi").load("s3a://<base_path>/.hoodie/metadata/").show
> {code}
>
> {code:java}
> Caused by: java.lang.NullPointerException
> at
> org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:178)
> at
> org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:167)
> at
> org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:163)
> at
> org.apache.hudi.HoodieBaseRelation$.$anonfun$createHFileReader$1(HoodieBaseRelation.scala:531)
> at
> org.apache.hudi.HoodieBaseRelation.$anonfun$createBaseFileReader$1(HoodieBaseRelation.scala:482)
> at
> org.apache.hudi.HoodieMergeOnReadRDD.readBaseFile(HoodieMergeOnReadRDD.scala:130)
> at
> org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:100)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
> at org.apache.spark.scheduler.Task.run(Task.scala:131)
> at
> org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:750) {code}
> {code:java}
> Caused by: java.lang.NullPointerException
> at
> org.apache.hudi.metadata.HoodieBackedTableMetadata.getLogRecordScanner(HoodieBackedTableMetadata.java:484)
> at
> org.apache.hudi.HoodieMergeOnReadRDD$.scanLog(HoodieMergeOnReadRDD.scala:342)
> at
> org.apache.hudi.HoodieMergeOnReadRDD$LogFileIterator.<init>(HoodieMergeOnReadRDD.scala:173)
> at
> org.apache.hudi.HoodieMergeOnReadRDD$RecordMergingFileIterator.<init>(HoodieMergeOnReadRDD.scala:252)
> at
> org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:101)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
> at org.apache.spark.scheduler.Task.run(Task.scala:131)
> at
> org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748) {code}
> Spark shell:
> {code:java}
> ./bin/spark-shell \
> --master yarn \
> --deploy-mode client \
> --driver-memory 20g \
> --executor-memory 20g \
> --num-executors 2 \
> --executor-cores 8 \
> --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
> --conf spark.kryoserializer.buffer=256m \
> --conf spark.kryoserializer.buffer.max=1024m \
> --jars /home/hadoop/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \
> --conf 'spark.eventLog.enabled=true' --conf
> 'spark.eventLog.dir=hdfs:///var/log/spark/apps' \
> --conf
> 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
> --conf
> 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
> {code}
>
>
--
This message was sent by Atlassian Jira
(v8.20.7#820007)