Hi, all

My Spark program always fails with "java.lang.OutOfMemoryError: Java heap
space" on my standalone cluster. Here is my code:

import org.apache.spark.SparkContext
import org.json.JSONObject    // JSON parser used for each input line

object SimCalcuTotal {
    def main(args: Array[String]) {
        val sc = new SparkContext("spark://192.168.2.184:7077", "Sim Calcu Total",
            "/usr/local/spark-0.9.0-incubating-bin-hadoop2",
            Seq("/home/deployer/score-calcu-assembly-1.0.jar"))
        // val sc = new SparkContext("local", "Score Calcu Total")

        // parse every line of the input file into a JSONObject
        val mongoRDD = sc.textFile("/home/deployer/uris.dat", 200)
        val jsonRDD = mongoRDD.map(arg => new JSONObject(arg))

        val newRDD = jsonRDD.map(arg => {
            // 0.5 for test
            var score = 0.5
            arg.put("score", score)
            arg
        })

        // all distinct resource ids
        val resourcesRDD = jsonRDD.map(arg => arg.get("rid").toString.toLong).distinct

        // the program crashes at this line of code
        val bcResources = sc.broadcast(resourcesRDD.collect.toList)

        // rid -> all (zid, score) pairs for that resource
        val resourceScoresRDD = newRDD.map(arg =>
            (arg.get("rid").toString.toLong,
             (arg.get("zid").toString, arg.get("score").asInstanceOf[Number].doubleValue))).groupByKey()
        val resourceScores = sc.broadcast(resourceScoresRDD.collect.toMap)

        def calSim(item1: Long, item2: Long) = {
            val iv1 = resourceScores.value(item1)
            val iv2 = resourceScores.value(item2)

            // 0.5 for test
            var distance = 0.5
            if (distance > 0.05) {
                var json = new JSONObject()
                json.put("_id", item1.toString + item2.toString)
                json.put("rid1", item1)
                json.put("rid2", item2)
                json.put("sim", distance)
                json
            }
            else null
        }

        // val saveRDD = newRDD.map(arg => arg.toString)
        // newRDD.saveAsTextFile(args(1).toString)
        val similarityRDD = resourcesRDD.flatMap(resource => {
            for (other <- bcResources.value if resource > other) yield calSim(resource, other)
        }).filter(arg => arg != null)
        similarityRDD.saveAsTextFile("/home/deployer/sim")
    }
}

The data file "/home/deployer/uris.dat" is 2 GB, with lines like this:

    { "id" : 1, "a" : { "0" : 1 }, "rid" : 5487628, "zid" : "10550869" }

And here is my spark-env.sh:
    export SCALA_HOME=/usr/local/scala-2.10.3
    export SPARK_MASTER_IP=192.168.2.184
    export SPARK_MASTER_PORT=7077
    export SPARK_LOCAL_IP=192.168.2.182
    export SPARK_WORKER_MEMORY=20g
    export SPARK_MEM=10g
    export SPARK_JAVA_OPTS="-Xms4g -Xmx40g -XX:MaxPermSize=10g -XX:-UseGCOverheadLimit"
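
For reference, this is how I understand these memory settings would map onto
the programmatic SparkConf API in 0.9 (with spark.executor.memory standing in
for SPARK_MEM); it is only a sketch based on my reading of the docs, included
to make the intended configuration explicit:

    import org.apache.spark.{SparkConf, SparkContext}

    object SimCalcuTotalWithConf {
        def main(args: Array[String]) {
            // SparkConf counterpart of the env settings above (my understanding of
            // the 0.9 API): 10g per executor, same master, spark home and jar
            val conf = new SparkConf()
                .setMaster("spark://192.168.2.184:7077")
                .setAppName("Sim Calcu Total")
                .setSparkHome("/usr/local/spark-0.9.0-incubating-bin-hadoop2")
                .setJars(Seq("/home/deployer/score-calcu-assembly-1.0.jar"))
                .set("spark.executor.memory", "10g")
            val sc = new SparkContext(conf)
            // ... same job as above ...
            sc.stop()
        }
    }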

There are two processes on my server while the Spark program is running
(before it crashes):
    java -cp
:/usr/local/spark-0.9.0-incubating-bin-hadoop2/conf:/usr/local/spark-0.9.0-incubating-bin-hadoop2/assembly/target/scala-2.10/spark-assembly_2.10-0.9.0-incubating-hadoop2.2.0.jar
-Xms4g -Xmx40g -XX:MaxPermSize=10g -XX:-UseGCOverheadLimit -Xms4g -Xmx40g
-XX:MaxPermSize=10g -XX:-UseGCOverheadLimit -Xms512M -Xmx512M
org.apache.spark.executor.CoarseGrainedExecutorBackend
akka.tcp://spark@192.168.2.183:51339/user/CoarseGrainedScheduler 0
192.168.2.182 16 akka.tcp://sparkWorker@192.168.2.182:45588/user/Worker
app-20140415172433-0001 

    java -cp
:/usr/local/spark-0.9.0-incubating-bin-hadoop2/conf:/usr/local/spark-0.9.0-incubating-bin-hadoop2/assembly/target/scala-2.10/spark-assembly_2.10-0.9.0-incubating-hadoop2.2.0.jar
-Dspark.akka.logLifecycleEvents=true -Djava.library.path= -Xms512m -Xmx512m
org.apache.spark.deploy.worker.Worker spark://192.168.2.184:7077

Is there anybody who can help me? Thanks very much!!


