[ 
https://issues.apache.org/jira/browse/HUDI-4205?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Ethan Guo updated HUDI-4205:
----------------------------
    Description: 
Environment: EMR 6.6.0, OSS Spark 3.2.1, Hudi master

Storage: S3

When loading the metadata table in Spark shell using the following code, it 
throws a NullPointerException.  In this case, the metadata table has its base 
files in HFile format.

This also happens for the following combinations: (1) Spark 3.1.3, Hudi 0.11.0; 
(2) Spark 3.2.1, Hudi 0.11.0.
{code:java}
spark.read.format("hudi").load("s3a://<base_path>/.hoodie/metadata/").show 
{code}
 
{code:java}
Caused by: java.lang.NullPointerException
  at 
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:178)
  at 
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:167)
  at 
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:163)
  at 
org.apache.hudi.HoodieBaseRelation$.$anonfun$createHFileReader$1(HoodieBaseRelation.scala:531)
  at 
org.apache.hudi.HoodieBaseRelation.$anonfun$createBaseFileReader$1(HoodieBaseRelation.scala:482)
  at 
org.apache.hudi.HoodieMergeOnReadRDD.readBaseFile(HoodieMergeOnReadRDD.scala:130)
  at 
org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:100)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
  at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
  at org.apache.spark.scheduler.Task.run(Task.scala:131)
  at 
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
  at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
  at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  at java.lang.Thread.run(Thread.java:750) {code}
Spark shell:
{code:java}
./bin/spark-shell  \
     --master yarn \
     --deploy-mode client \
     --driver-memory 20g \
     --executor-memory 20g \
     --num-executors 2 \
     --executor-cores 8 \
     --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
     --conf spark.kryoserializer.buffer=256m \
     --conf spark.kryoserializer.buffer.max=1024m \
     --jars /home/hadoop/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \
     --conf 'spark.eventLog.enabled=true' --conf 
'spark.eventLog.dir=hdfs:///var/log/spark/apps' \
     --conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
     --conf 
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
 {code}
 

 

  was:
Environment: EMR 6.6.0, OSS Spark 3.2.1, Hudi master

Storage: S3

When loading the metadata table in Spark shell using the following code, it 
throws NullPointerException.

This also happens for the following combinations: (1) Spark 3.1.3, Hudi 0.11.0 
(1) Spark 3.2.1, Hudi 0.11.0 
{code:java}
spark.read.format("hudi").load("s3a://<base_path>/.hoodie/metadata/").show 
{code}
 
{code:java}
Caused by: java.lang.NullPointerException
  at 
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:178)
  at 
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:167)
  at 
org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:163)
  at 
org.apache.hudi.HoodieBaseRelation$.$anonfun$createHFileReader$1(HoodieBaseRelation.scala:531)
  at 
org.apache.hudi.HoodieBaseRelation.$anonfun$createBaseFileReader$1(HoodieBaseRelation.scala:482)
  at 
org.apache.hudi.HoodieMergeOnReadRDD.readBaseFile(HoodieMergeOnReadRDD.scala:130)
  at 
org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:100)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
  at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
  at org.apache.spark.scheduler.Task.run(Task.scala:131)
  at 
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
  at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
  at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  at java.lang.Thread.run(Thread.java:750) {code}
{code:java}
Caused by: java.lang.NullPointerException
  at 
org.apache.hudi.metadata.HoodieBackedTableMetadata.getLogRecordScanner(HoodieBackedTableMetadata.java:484)
  at 
org.apache.hudi.HoodieMergeOnReadRDD$.scanLog(HoodieMergeOnReadRDD.scala:342)
  at 
org.apache.hudi.HoodieMergeOnReadRDD$LogFileIterator.<init>(HoodieMergeOnReadRDD.scala:173)
  at 
org.apache.hudi.HoodieMergeOnReadRDD$RecordMergingFileIterator.<init>(HoodieMergeOnReadRDD.scala:252)
  at 
org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:101)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
  at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
  at org.apache.spark.scheduler.Task.run(Task.scala:131)
  at 
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
  at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
  at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  at java.lang.Thread.run(Thread.java:748) {code}
Spark shell:
{code:java}
./bin/spark-shell  \
     --master yarn \
     --deploy-mode client \
     --driver-memory 20g \
     --executor-memory 20g \
     --num-executors 2 \
     --executor-cores 8 \
     --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
     --conf spark.kryoserializer.buffer=256m \
     --conf spark.kryoserializer.buffer.max=1024m \
     --jars /home/hadoop/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \
     --conf 'spark.eventLog.enabled=true' --conf 
'spark.eventLog.dir=hdfs:///var/log/spark/apps' \
     --conf 
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
     --conf 
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
 {code}
 

 


> Reading metadata table on S3 using Spark throws NullPointerException during 
> createHFileReader
> ---------------------------------------------------------------------------------------------
>
>                 Key: HUDI-4205
>                 URL: https://issues.apache.org/jira/browse/HUDI-4205
>             Project: Apache Hudi
>          Issue Type: Bug
>          Components: metadata, spark
>    Affects Versions: 0.11.0
>            Reporter: Ethan Guo
>            Assignee: Ethan Guo
>            Priority: Blocker
>             Fix For: 0.12.0
>
>
> Environment: EMR 6.6.0, OSS Spark 3.2.1, Hudi master
> Storage: S3
> When loading the metadata table in Spark shell using the following code, it 
> throws a NullPointerException.  In this case, the metadata table has its base 
> files in HFile format.
> This also happens for the following combinations: (1) Spark 3.1.3, Hudi 
> 0.11.0; (2) Spark 3.2.1, Hudi 0.11.0.
> {code:java}
> spark.read.format("hudi").load("s3a://<base_path>/.hoodie/metadata/").show 
> {code}
>  
> {code:java}
> Caused by: java.lang.NullPointerException
>   at 
> org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:178)
>   at 
> org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:167)
>   at 
> org.apache.hudi.org.apache.hadoop.hbase.io.hfile.CacheConfig.<init>(CacheConfig.java:163)
>   at 
> org.apache.hudi.HoodieBaseRelation$.$anonfun$createHFileReader$1(HoodieBaseRelation.scala:531)
>   at 
> org.apache.hudi.HoodieBaseRelation.$anonfun$createBaseFileReader$1(HoodieBaseRelation.scala:482)
>   at 
> org.apache.hudi.HoodieMergeOnReadRDD.readBaseFile(HoodieMergeOnReadRDD.scala:130)
>   at 
> org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:100)
>   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
>   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
>   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
>   at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
>   at org.apache.spark.scheduler.Task.run(Task.scala:131)
>   at 
> org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
>   at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
>   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
>   at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>   at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>   at java.lang.Thread.run(Thread.java:750) {code}
> Spark shell:
> {code:java}
> ./bin/spark-shell  \
>      --master yarn \
>      --deploy-mode client \
>      --driver-memory 20g \
>      --executor-memory 20g \
>      --num-executors 2 \
>      --executor-cores 8 \
>      --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
>      --conf spark.kryoserializer.buffer=256m \
>      --conf spark.kryoserializer.buffer.max=1024m \
>      --jars /home/hadoop/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \
>      --conf 'spark.eventLog.enabled=true' --conf 
> 'spark.eventLog.dir=hdfs:///var/log/spark/apps' \
>      --conf 
> 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
>      --conf 
> 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
>  {code}
>  
>  



--
This message was sent by Atlassian Jira
(v8.20.7#820007)

Reply via email to