Re: How to debug this error?

2014-08-29 Thread Yanbo Liang
You cannot use an RDD inside a map function.
RDDs can only be operated on from the driver of a Spark program.
In your case, the `group` RDD is not available on the executors.

I guess you want to implement a subquery-like operation; try using
RDD.intersection() or join() instead.


2014-08-29 12:43 GMT+08:00 Gary Zhao garyz...@gmail.com:

 Hello

 I'm new to Spark and playing around, but saw the following error. Could
 anyone help with it?

 Thanks
 Gary



 scala> c
 res15: org.apache.spark.rdd.RDD[String] = FlatMappedRDD[7] at flatMap at
 console:23

 scala> group
 res16: org.apache.spark.rdd.RDD[(String, Iterable[String])] =
 MappedValuesRDD[5] at groupByKey at console:19

 val d = c.map(i => group.filter(_._1 == i))

 d.first

 14/08/29 04:39:33 INFO TaskSchedulerImpl: Cancelling stage 28
 14/08/29 04:39:33 INFO DAGScheduler: Failed to run first at console:28
 org.apache.spark.SparkException: Job aborted due to stage failure: Task
 28.0:180 failed 4 times, most recent failure: Exception failure in TID 3605
 on host mcs-spark-slave1-staging.snc1: java.lang.NullPointerException
 org.apache.spark.rdd.RDD.filter(RDD.scala:282)
 $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(console:25)
 $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(console:25)
 scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
 scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
 scala.collection.Iterator$class.foreach(Iterator.scala:727)
 scala.collection.AbstractIterator.foreach(Iterator.scala:1157)

 scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)

 scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)

 scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
 scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
 scala.collection.AbstractIterator.to(Iterator.scala:1157)

 scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
 scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)

 scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
 scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
 org.apache.spark.rdd.RDD$$anonfun$28.apply(RDD.scala:1003)
 org.apache.spark.rdd.RDD$$anonfun$28.apply(RDD.scala:1003)

 org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1083)

 org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1083)
 org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:111)
 org.apache.spark.scheduler.Task.run(Task.scala:51)

 org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:183)

 java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)

 java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
 java.lang.Thread.run(Thread.java:744)
 Driver stacktrace:
 at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1049)
  at
 org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1033)
 at
 org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1031)
  at
 scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
 at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
  at
 org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1031)
 at
 org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:635)
  at
 org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:635)
 at scala.Option.foreach(Option.scala:236)
  at
 org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:635)
 at
 org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1234)
  at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
 at akka.actor.ActorCell.invoke(ActorCell.scala:456)
  at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
 at akka.dispatch.Mailbox.run(Mailbox.scala:219)
  at
 akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
 at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
  at
 scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
 at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
  at
 scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)



How to debug this error?

2014-08-28 Thread Gary Zhao
Hello

I'm new to Spark and playing around, but saw the following error. Could
anyone help with it?

Thanks
Gary



scala> c
res15: org.apache.spark.rdd.RDD[String] = FlatMappedRDD[7] at flatMap at
console:23

scala> group
res16: org.apache.spark.rdd.RDD[(String, Iterable[String])] =
MappedValuesRDD[5] at groupByKey at console:19

val d = c.map(i => group.filter(_._1 == i))

d.first

14/08/29 04:39:33 INFO TaskSchedulerImpl: Cancelling stage 28
14/08/29 04:39:33 INFO DAGScheduler: Failed to run first at console:28
org.apache.spark.SparkException: Job aborted due to stage failure: Task
28.0:180 failed 4 times, most recent failure: Exception failure in TID 3605
on host mcs-spark-slave1-staging.snc1: java.lang.NullPointerException
org.apache.spark.rdd.RDD.filter(RDD.scala:282)
$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(console:25)
$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(console:25)
scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
scala.collection.Iterator$class.foreach(Iterator.scala:727)
scala.collection.AbstractIterator.foreach(Iterator.scala:1157)

scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)

scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)

scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
scala.collection.AbstractIterator.to(Iterator.scala:1157)

scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)

scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
org.apache.spark.rdd.RDD$$anonfun$28.apply(RDD.scala:1003)
org.apache.spark.rdd.RDD$$anonfun$28.apply(RDD.scala:1003)

org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1083)

org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1083)
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:111)
org.apache.spark.scheduler.Task.run(Task.scala:51)

org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:183)

java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)

java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
java.lang.Thread.run(Thread.java:744)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1049)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1033)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1031)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1031)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:635)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:635)
at scala.Option.foreach(Option.scala:236)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:635)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1234)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at
scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)