Michael Kaufmann created CRAIL-38:
-------------------------------------
Summary: Spark broadcast and shuffle files missing/corrupted
Key: CRAIL-38
URL: https://issues.apache.org/jira/browse/CRAIL-38
Project: Apache Crail
Issue Type: Bug
Reporter: Michael Kaufmann
When I use Crail in a serverless setting with more than one executor per node,
broadcast and shuffle files are not created reliably. During the read
phase of those files, this causes data corruption and, as a consequence,
deserialization errors in Kryo.
{{java.io.IOException: com.esotericsoftware.kryo.KryoException: Error during
Java deserialization.}}
{{ at
org.apache.spark.broadcast.Utils$.tryOrIOException(CrailBroadcast.scala:123)}}
{{ at
org.apache.spark.broadcast.CrailBroadcast.readBroadcastBlock(CrailBroadcast.scala:60)}}
{{ at
org.apache.spark.broadcast.CrailBroadcast._value$lzycompute(CrailBroadcast.scala:37)}}
{{ at
org.apache.spark.broadcast.CrailBroadcast._value(CrailBroadcast.scala:37)}}
{{ at
org.apache.spark.broadcast.CrailBroadcast.getValue(CrailBroadcast.scala:44)}}
{{ at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)}}
{{ at
org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3$$anonfun$apply$9.apply(PythonRDD.scala:306)}}
{{ at
org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3$$anonfun$apply$9.apply(PythonRDD.scala:302)}}
{{ at scala.collection.immutable.List.foreach(List.scala:381)}}
{{ at
org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:302)}}
{{ at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1948)}}
{{ at
org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)}}
{{Caused by: com.esotericsoftware.kryo.KryoException: Error during Java
deserialization.}}
{{ at
com.esotericsoftware.kryo.serializers.JavaSerializer.read(JavaSerializer.java:65)}}
{{ at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)}}
{{ at
org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:246)}}
{{ at
org.apache.spark.serializer.CrailSparkDeserializationStream.readBroadcast(CrailSparkSerializer.scala:135)}}
{{ at
org.apache.spark.storage.CrailDispatcher.readBroadcast(CrailDispatcher.scala:334)}}
{{ at
org.apache.spark.broadcast.CrailBroadcast$$anonfun$readBroadcastBlock$1.apply(CrailBroadcast.scala:73)}}
{{ at
org.apache.spark.broadcast.Utils$.tryOrIOException(CrailBroadcast.scala:118)}}
{{ ... 11 more}}
{{Caused by: java.io.StreamCorruptedException: invalid type code: 40}}
{{ at
java.io.ObjectInputStream$BlockDataInputStream.readBlockHeader(ObjectInputStream.java:2828)}}
{{ at
java.io.ObjectInputStream$BlockDataInputStream.refill(ObjectInputStream.java:2862)}}
{{ at
java.io.ObjectInputStream$BlockDataInputStream.read(ObjectInputStream.java:3021)}}
{{ at java.io.ObjectInputStream.read(ObjectInputStream.java:917)}}
{{ at java.io.InputStream.read(InputStream.java:101)}}
{{ at
org.apache.spark.util.Utils$$anonfun$copyStream$1.apply$mcJ$sp(Utils.scala:340)}}
{{ at
org.apache.spark.util.Utils$$anonfun$copyStream$1.apply(Utils.scala:327)}}
{{ at
org.apache.spark.util.Utils$$anonfun$copyStream$1.apply(Utils.scala:327)}}
{{ at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1337)}}
{{ at org.apache.spark.util.Utils$.copyStream(Utils.scala:348)}}
{{ at
org.apache.spark.api.python.PythonBroadcast$$anonfun$readObject$1$$anonfun$apply$mcJ$sp$1.apply$mcJ$sp(PythonRDD.scala:968)}}
{{ at
org.apache.spark.api.python.PythonBroadcast$$anonfun$readObject$1$$anonfun$apply$mcJ$sp$1.apply(PythonRDD.scala:968)}}
{{ at
org.apache.spark.api.python.PythonBroadcast$$anonfun$readObject$1$$anonfun$apply$mcJ$sp$1.apply(PythonRDD.scala:968)}}
{{ at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1337)}}
{{ at
org.apache.spark.api.python.PythonBroadcast$$anonfun$readObject$1.apply$mcJ$sp(PythonRDD.scala:969)}}
{{ at
org.apache.spark.api.python.PythonBroadcast$$anonfun$readObject$1.apply(PythonRDD.scala:962)}}
{{ at
org.apache.spark.api.python.PythonBroadcast$$anonfun$readObject$1.apply(PythonRDD.scala:962)}}
{{ at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1303)}}
{{ at
org.apache.spark.api.python.PythonBroadcast.readObject(PythonRDD.scala:962)}}
{{ at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)}}
{{ at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)}}
{{ at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)}}
{{ at java.lang.reflect.Method.invoke(Method.java:498)}}
{{ at
java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1058)}}
{{ at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2136)}}
{{ at
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)}}
{{ at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)}}
{{ at java.io.ObjectInputStream.readObject(ObjectInputStream.java:422)}}
{{ at
com.esotericsoftware.kryo.serializers.JavaSerializer.read(JavaSerializer.java:63)}}
{{ ... 17 more}}
{{18/05/29 10:34:21 WARN 49 TaskSetManager: Lost task 8.0 in stage 0.0 (TID 1,
flex16.zurich.ibm.com, executor 11000): java.io.IOException:
com.esotericsoftware.kryo.KryoException: Encountered unregistered class ID:
5135}}
{{ at
org.apache.spark.broadcast.Utils$.tryOrIOException(CrailBroadcast.scala:123)}}
{{ at
org.apache.spark.broadcast.CrailBroadcast.readBroadcastBlock(CrailBroadcast.scala:60)}}
{{ at
org.apache.spark.broadcast.CrailBroadcast._value$lzycompute(CrailBroadcast.scala:37)}}
{{ at
org.apache.spark.broadcast.CrailBroadcast._value(CrailBroadcast.scala:37)}}
{{ at
org.apache.spark.broadcast.CrailBroadcast.getValue(CrailBroadcast.scala:44)}}
{{ at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)}}
{{ at
org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3$$anonfun$apply$9.apply(PythonRDD.scala:306)}}
{{ at
org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3$$anonfun$apply$9.apply(PythonRDD.scala:302)}}
{{ at scala.collection.immutable.List.foreach(List.scala:381)}}
{{ at
org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:302)}}
{{ at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1948)}}
{{ at
org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)}}
{{Caused by: com.esotericsoftware.kryo.KryoException: Encountered unregistered
class ID: 5135}}
{{ at
com.esotericsoftware.kryo.util.DefaultClassResolver.readClass(DefaultClassResolver.java:137)}}
{{ at com.esotericsoftware.kryo.Kryo.readClass(Kryo.java:670)}}
{{ at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:781)}}
{{ at
org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:246)}}
{{ at
org.apache.spark.serializer.CrailSparkDeserializationStream.readBroadcast(CrailSparkSerializer.scala:135)}}
{{ at
org.apache.spark.storage.CrailDispatcher.readBroadcast(CrailDispatcher.scala:334)}}
{{ at
org.apache.spark.broadcast.CrailBroadcast$$anonfun$readBroadcastBlock$1.apply(CrailBroadcast.scala:73)}}
{{ at
org.apache.spark.broadcast.Utils$.tryOrIOException(CrailBroadcast.scala:118)}}
{{ ... 11 more}}
Running a single application is enough to trigger this, i.e. executors
stay associated with the same driver over the entire runtime of the application.
In the above case, however, all broadcast files appear to be present.
{{18/05/29 10:40:21 INFO crail: passive data client}}
{{-rw-rw-rw- 1 stu stu 3150291 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_0}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_1}}
{{-rw-rw-rw- 1 stu stu 3150819 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_10}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_11}}
{{-rw-rw-rw- 1 stu stu 3149952 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_12}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_13}}
{{-rw-rw-rw- 1 stu stu 3151172 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_14}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_15}}
{{-rw-rw-rw- 1 stu stu 3150579 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_16}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_17}}
{{-rw-rw-rw- 1 stu stu 3150977 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_18}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_19}}
{{-rw-rw-rw- 1 stu stu 3150315 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_2}}
{{-rw-rw-rw- 1 stu stu 3150254 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_20}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_21}}
{{-rw-rw-rw- 1 stu stu 9061 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_22}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_3}}
{{-rw-rw-rw- 1 stu stu 3150819 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_4}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_5}}
{{-rw-rw-rw- 1 stu stu 3150495 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_6}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_7}}
{{-rw-rw-rw- 1 stu stu 3150723 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_8}}
{{-rw-rw-rw- 1 stu stu 80599 2018-05-29 10:34
/spark/app-20180529103415-0046/broadcast/broadcast_9}}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)