[ https://issues.apache.org/jira/browse/SPARK-23351?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16362156#comment-16362156 ]

David Ahern edited comment on SPARK-23351 at 2/13/18 11:17 AM:
---------------------------------------------------------------

Hi, yes - it is HDFS.

I am on Cloudera... only 2.2.0 is available from them for now.

I will have to wait for the upgrade... I think they only pick up major releases, e.g. 2.3.0.

Just for the record, another type of exception occurred a while back... probably related, but it was mostly the one I reported above:

18/01/26 09:56:14 INFO yarn.Client: Application report for application_1513677310005_0840 (state: FINISHED)
18/01/26 09:56:14 INFO yarn.Client:
     client token: N/A
     diagnostics: User class threw exception: org.apache.spark.sql.streaming.StreamingQueryException: Job aborted due to stage failure: Task 156 in stage 1.0 failed 4 times, most recent failure: Lost task 156.3 in stage 1.0 (TID 392, gbslixaacspa06u.metis.prd, executor 1): java.io.EOFException: Stream ended prematurely
    at org.apache.spark.io.LZ4BlockInputStream.readFully(LZ4BlockInputStream.java:230)
    at org.apache.spark.io.LZ4BlockInputStream.refill(LZ4BlockInputStream.java:203)
    at org.apache.spark.io.LZ4BlockInputStream.read(LZ4BlockInputStream.java:125)
    at java.io.DataInputStream.read(DataInputStream.java:149)
    at org.spark_project.guava.io.ByteStreams.read(ByteStreams.java:899)
    at org.spark_project.guava.io.ByteStreams.readFully(ByteStreams.java:733)
    at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$readSnapshotFile(HDFSBackedStateStoreProvider.scala:489)
    at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$$anonfun$org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$loadMap$1.apply(HDFSBackedStateStoreProvider.scala:359)
    at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$$anonfun$org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$loadMap$1.apply(HDFSBackedStateStoreProvider.scala:358)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$loadMap(HDFSBackedStateStoreProvider.scala:358)
    at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.getStore(HDFSBackedStateStoreProvider.scala:265)
    at org.apache.spark.sql.execution.streaming.state.StateStore$.get(StateStore.scala:200)
    at org.apache.spark.sql.execution.streaming.state.StateStoreRDD.compute(StateStoreRDD.scala:61)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
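
For illustration, here is a minimal, self-contained Scala sketch (not Spark's actual code) of how a half-written, length-prefixed state file produces exactly this kind of EOFException on restart: the writer stops after the header but before the payload is complete, and a reader that trusts the header then runs off the end of the stream, much like readSnapshotFile in the trace above. The file name and sizes are made up.

{code:scala}
import java.io.{DataInputStream, DataOutputStream, EOFException, FileInputStream, FileOutputStream}

object TruncatedSnapshotDemo {
  def main(args: Array[String]): Unit = {
    val path = "snapshot.demo" // hypothetical file, for illustration only

    // Write a length-prefixed record, but stop before the payload is complete,
    // simulating a crash (or an unflushed stream) mid-checkpoint.
    val out = new DataOutputStream(new FileOutputStream(path))
    out.writeInt(1024)              // header promises 1024 payload bytes...
    out.write(new Array[Byte](100)) // ...but only 100 reach the file
    out.close()

    // A reader that trusts the header hits end-of-file mid-record.
    val in = new DataInputStream(new FileInputStream(path))
    try {
      val declared = in.readInt()
      val buf = new Array[Byte](declared)
      in.readFully(buf) // throws java.io.EOFException once the 100 available bytes run out
    } catch {
      case e: EOFException => println(s"truncated snapshot: $e")
    } finally in.close()
  }
}
{code}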


> checkpoint corruption in long running application
> -------------------------------------------------
>
>                 Key: SPARK-23351
>                 URL: https://issues.apache.org/jira/browse/SPARK-23351
>             Project: Spark
>          Issue Type: Bug
>          Components: Structured Streaming
>    Affects Versions: 2.2.0
>            Reporter: David Ahern
>            Priority: Major
>
> Hi, after leaving my (somewhat high-volume) Structured Streaming application
> running for some time, I get the following exception. The same exception
> also repeats when I try to restart the application. The only way to get the
> application running again is to clear the checkpoint directory, which is far
> from ideal.
> Maybe a stream is not being flushed/closed properly internally by Spark when
> checkpointing?
>  
>  User class threw exception: org.apache.spark.sql.streaming.StreamingQueryException: Job aborted due to stage failure: Task 55 in stage 1.0 failed 4 times, most recent failure: Lost task 55.3 in stage 1.0 (TID 240, gbslixaacspa04u.metis.prd, executor 2): java.io.EOFException
>  at java.io.DataInputStream.readInt(DataInputStream.java:392)
>  at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$readSnapshotFile(HDFSBackedStateStoreProvider.scala:481)
>  at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$$anonfun$org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$loadMap$1.apply(HDFSBackedStateStoreProvider.scala:359)
>  at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$$anonfun$org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$loadMap$1.apply(HDFSBackedStateStoreProvider.scala:358)
>  at scala.Option.getOrElse(Option.scala:121)
>  at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$loadMap(HDFSBackedStateStoreProvider.scala:358)
>  at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.getStore(HDFSBackedStateStoreProvider.scala:265)
>  at org.apache.spark.sql.execution.streaming.state.StateStore$.get(StateStore.scala:200)
>  at org.apache.spark.sql.execution.streaming.state.StateStoreRDD.compute(StateStoreRDD.scala:61)
>  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
>  at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
>  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
>  at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
>  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
>  at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
>  at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
>  at org.apache.spark.scheduler.Task.run(Task.scala:108)
>  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
>  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>  at java.lang.Thread.run(Thread.java:745)
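
The description above speculates that a stream is not being flushed or closed properly during checkpointing. Whatever the actual root cause in Spark turns out to be, the standard defence against readers ever observing a half-written state file is to write to a temporary file, flush and sync it, and only then atomically rename it into place. Below is a minimal local-filesystem Scala sketch of that pattern; the ".tmp" naming and the writeSnapshot signature are made up for illustration, and this is not a claim about how HDFSBackedStateStoreProvider is implemented (HDFS offers the equivalent guarantee via FileSystem.rename).

{code:scala}
import java.io.{DataOutputStream, FileOutputStream}
import java.nio.file.{Files, Paths, StandardCopyOption}

object AtomicCheckpointWrite {
  // Write the snapshot to a temp file, force it to disk, then atomically
  // rename it into place, so a concurrent or later reader sees either the
  // previous complete file or the new complete file - never a torn write.
  def writeSnapshot(finalPath: String, payload: Array[Byte]): Unit = {
    val tmpPath = finalPath + ".tmp" // hypothetical naming convention
    val fos = new FileOutputStream(tmpPath)
    val out = new DataOutputStream(fos)
    try {
      out.writeInt(payload.length) // length prefix, as in the sketch earlier
      out.write(payload)
      out.flush()
      fos.getFD.sync() // make sure the bytes are on disk before the rename
    } finally out.close()

    // On POSIX filesystems ATOMIC_MOVE maps to rename(2), which replaces
    // the target in a single atomic step.
    Files.move(Paths.get(tmpPath), Paths.get(finalPath), StandardCopyOption.ATOMIC_MOVE)
  }
}
{code}

With a writer like this, a restart that still fails would point at corruption introduced elsewhere (e.g. by the filesystem) rather than at a torn checkpoint write.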


