[ https://issues.apache.org/jira/browse/SPARK-18802?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Bingozz updated SPARK-18802:
----------------------------
Description:
I installed spark-2.0.1-bin-hadoop2.7 on my Spark cluster, which has one master and four workers.
The Scala version is 2.11.8 on both my local machine and the cluster machines, and apps such as WordCount run fine from the spark-shell against both the local and the remote master.
On my local machine, I simply added the jars from the `spark-2.0.1-bin-hadoop2.7/jars` directory as dependencies of my project in IntelliJ IDEA. The application works if I only load the file from HDFS, but fails as soon as I run a WordCount on the loaded file.
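For reference, instead of copying the jars from that directory by hand, the same dependency could be declared through sbt. The following is only a sketch using the standard Maven coordinates for this Spark release, not my actual build file:
```
// build.sbt (sketch): pulls the same Spark 2.0.1 / Scala 2.11 binaries that
// ship in spark-2.0.1-bin-hadoop2.7/jars, instead of adding them manually.
name := "topK"

scalaVersion := "2.11.8"

libraryDependencies += "org.apache.spark" %% "spark-core" % "2.0.1"
```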
My code is below:
```
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf

object topK {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test_spark")
      .setMaster("spark://10.112.29.56:7077")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("hdfs://10.112.28.38:9000/user/root/covtype")
    println(lines.count())
    // val count = lines.flatMap(s => s.split(",")).map(s => (s, 1)).reduceByKey((a, b) => a + b)
    // println(count.count() + "\n")
    sc.stop()
    println("helloworld")
  }
}
```
And the error is below:
```
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 5, 10.112.29.80): java.lang.ClassCastException: cannot assign instance of scala.collection.immutable.List$SerializationProxy to field org.apache.spark.rdd.RDD.org$apache$spark$rdd$RDD$$dependencies_ of type scala.collection.Seq in instance of org.apache.spark.rdd.MapPartitionsRDD
	at java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2133)
	at java.io.ObjectStreamClass.setObjFieldValues(ObjectStreamClass.java:1305)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2024)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1942)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1808)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1353)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2018)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1942)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1808)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1353)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:373)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:71)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
	at org.apache.spark.scheduler.Task.run(Task.scala:86)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1890)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1916)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1930)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1134)
	at topK$.main(topK.scala:16)
	at topK.main(topK.scala)
Caused by: java.lang.ClassCastException: cannot assign instance of scala.collection.immutable.List$SerializationProxy to field org.apache.spark.rdd.RDD.org$apache$spark$rdd$RDD$$dependencies_ of type scala.collection.Seq in instance of org.apache.spark.rdd.MapPartitionsRDD
	at java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2133)
	at java.io.ObjectStreamClass.setObjFieldValues(ObjectStreamClass.java:1305)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2024)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1942)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1808)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1353)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2018)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1942)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1808)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1353)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:373)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:71)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
	at org.apache.spark.scheduler.Task.run(Task.scala:86)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
```
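Since the job is launched straight from IntelliJ rather than through spark-submit, one thing I still want to rule out is that the compiled topK classes never reach the executors. The snippet below is only a sketch of how the application jar could be shipped to the cluster via SparkConf.setJars; the jar path is a placeholder for wherever the build writes it, and I have not confirmed whether this avoids the exception:
```
import org.apache.spark.{SparkConf, SparkContext}

object topK {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("test_spark")
      .setMaster("spark://10.112.29.56:7077")
      // Ship the application jar to the executors so that topK's classes and
      // closures can be deserialized on the worker side (path is a placeholder).
      .setJars(Seq("target/scala-2.11/topk_2.11-0.1.jar"))
    val sc = new SparkContext(conf)
    val lines = sc.textFile("hdfs://10.112.28.38:9000/user/root/covtype")
    println(lines.count())
    sc.stop()
  }
}
```
Packaging the project into a jar and launching it with bin/spark-submit would be the equivalent route.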
> java.lang.ClassCastException in a simple spark application
> ----------------------------------------------------------
>
> Key: SPARK-18802
> URL: https://issues.apache.org/jira/browse/SPARK-18802
> Project: Spark
> Issue Type: Bug
> Affects Versions: 2.0.1
> Reporter: Bingozz
>
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]