[ 
https://issues.apache.org/jira/browse/SPARK-2292?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14045576#comment-14045576
 ] 

Andrew Ash commented on SPARK-2292:
-----------------------------------

[~mkim] reported an issue with a very similar stacktrace to user@ three days 
ago, with repro.  It was for .mapToPair() failing rather than .reduceByKey()  
but the stacktrace has the same last few lines before the NPE.

{noformat}
        
org.apache.spark.api.java.JavaPairRDD$$anonfun$pairFunToScalaFun$1.apply(JavaPairRDD.scala:750)
        
org.apache.spark.api.java.JavaPairRDD$$anonfun$pairFunToScalaFun$1.apply(JavaPairRDD.scala:750)
        scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
{noformat}


Copied here for easy access:

{code}
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class Test {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
            .setMaster("spark://mymaster")
            .setAppName("MyApp")
            .setSparkHome("/my/spark/home");

        JavaSparkContext sc = new JavaSparkContext(conf);
        sc.addJar("/path/to/jar"); // ship the jar of this class
        JavaRDD<String> rdd = sc.textFile("/path/to/nums.csv”); // nums.csv 
simply has one integer per line
        JavaPairRDD<Integer, Integer> pairRdd = rdd.mapToPair(new 
MyPairFunction());

        System.out.println(pairRdd.collect());
    }

    private static final class MyPairFunction implements PairFunction<String, 
Integer, Integer> {
        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<Integer, Integer> call(String s) throws Exception {
            return new Tuple2<Integer, Integer>(Integer.parseInt(s), 
Integer.parseInt(s));
        }
    }
}
{code}

{noformat}
Exception in thread "main" 14/06/24 14:39:01 INFO scheduler.TaskSchedulerImpl: 
Removed TaskSet 0.0, whose tasks have all completed, from pool 
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0:0 
failed 4 times, most recent failure: Exception failure in TID 6 on host 
10.160.24.216: java.lang.NullPointerException
        
org.apache.spark.api.java.JavaPairRDD$$anonfun$pairFunToScalaFun$1.apply(JavaPairRDD.scala:750)
        
org.apache.spark.api.java.JavaPairRDD$$anonfun$pairFunToScalaFun$1.apply(JavaPairRDD.scala:750)
        scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        scala.collection.Iterator$class.foreach(Iterator.scala:727)
        scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
        scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
        
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
        scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
        scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
        scala.collection.AbstractIterator.to(Iterator.scala:1157)
        
scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
        scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
        
scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
        scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
        org.apache.spark.rdd.RDD$$anonfun$15.apply(RDD.scala:717)
        org.apache.spark.rdd.RDD$$anonfun$15.apply(RDD.scala:717)
        
org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1080)
        
org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1080)
        org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:111)
        org.apache.spark.scheduler.Task.run(Task.scala:51)
        org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187)
        
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        java.lang.Thread.run(Thread.java:722)
Driver stacktrace:
        at 
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1033)
        at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1017)
        at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1015)
        at 
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at 
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1015)
        at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
        at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
        at scala.Option.foreach(Option.scala:236)
        at 
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:633)
        at 
org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1207)
        at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
        at akka.actor.ActorCell.invoke(ActorCell.scala:456)
        at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
        at akka.dispatch.Mailbox.run(Mailbox.scala:219)
        at 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
        at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
        at 
scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
{noformat}

> NullPointerException in JavaPairRDD.reduceByKey
> -----------------------------------------------
>
>                 Key: SPARK-2292
>                 URL: https://issues.apache.org/jira/browse/SPARK-2292
>             Project: Spark
>          Issue Type: Bug
>          Components: Spark Core
>    Affects Versions: 1.0.0
>         Environment: Spark 1.0.0, Standalone with a single slave running with 
> 4G memory allocation. The data file was 
>            Reporter: Bharath Ravi Kumar
>            Priority: Critical
>
> Invoking JavaPairRDD.reduceByKey results in an NPE:
> {code}
> 14/06/26 21:05:35 WARN scheduler.TaskSetManager: Loss was due to 
> java.lang.NullPointerException
> java.lang.NullPointerException
>       at 
> org.apache.spark.api.java.JavaPairRDD$$anonfun$pairFunToScalaFun$1.apply(JavaPairRDD.scala:750)
>       at 
> org.apache.spark.api.java.JavaPairRDD$$anonfun$pairFunToScalaFun$1.apply(JavaPairRDD.scala:750)
>       at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
>       at org.apache.spark.Aggregator.combineValuesByKey(Aggregator.scala:59)
>       at 
> org.apache.spark.rdd.PairRDDFunctions$$anonfun$1.apply(PairRDDFunctions.scala:96)
>       at 
> org.apache.spark.rdd.PairRDDFunctions$$anonfun$1.apply(PairRDDFunctions.scala:95)
>       at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:582)
>       at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:582)
>       at 
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
>       at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
>       at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
>       at 
> org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:158)
>       at 
> org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
>       at org.apache.spark.scheduler.Task.run(Task.scala:51)
>       at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187)
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110)
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603)
>       at java.lang.Thread.run(Thread.java:722)
> {code}
>  This occurs only after migrating to the 1.0.0 API. The details of the code 
> the data file used to test are included in this gist : 
> https://gist.github.com/reachbach/d8977c8eb5f71f889301



--
This message was sent by Atlassian JIRA
(v6.2#6252)

Reply via email to