Hi all,

My Spark program always fails with "java.lang.OutOfMemoryError: Java heap space" on my standalone cluster. Here is my code:
object SimCalcuTotal {
  def main(args: Array[String]) {
    val sc = new SparkContext("spark://192.168.2.184:7077", "Sim Calcu Total",
      "/usr/local/spark-0.9.0-incubating-bin-hadoop2",
      Seq("/home/deployer/score-calcu-assembly-1.0.jar"))
    // val sc = new SparkContext("local", "Score Calcu Total")

    val mongoRDD = sc.textFile("/home/deployer/uris.dat", 200)
    val jsonRDD = mongoRDD.map(arg => new JSONObject(arg))

    val newRDD = jsonRDD.map(arg => {
      // 0.5 for test
      var score = 0.5
      arg.put("score", score)
      arg
    })

    // distinct resource ids
    val resourcesRDD = jsonRDD.map(arg => arg.get("rid").toString.toLong).distinct

    // the program crashes at this line of code
    val bcResources = sc.broadcast(resourcesRDD.collect.toList)

    // rid -> group of (zid, score)
    val resourceScoresRDD = newRDD.map(arg =>
      (arg.get("rid").toString.toLong,
       (arg.get("zid").toString, arg.get("score").asInstanceOf[Number].doubleValue))).groupByKey()
    val resouceScores = sc.broadcast(resourceScoresRDD.collect.toMap)

    def calSim(item1: Long, item2: Long) = {
      val iv1 = resouceScores.value(item1)
      val iv2 = resouceScores.value(item2)
      // 0.5 for test
      var distance = 0.5
      if (distance > 0.05) {
        var json = new JSONObject()
        json.put("_id", item1.toString + item2.toString)
        json.put("rid1", item1)
        json.put("rid2", item2)
        json.put("sim", distance)
        json
      } else null
    }

    // val saveRDD = newRDD.map(arg => arg.toString)
    // newRDD.saveAsTextFile(args(1).toString)

    val similarityRDD = resourcesRDD.flatMap(resource => {
      for (other <- bcResources.value if resource > other) yield calSim(resource, other)
    }).filter(arg => arg != null)

    similarityRDD.saveAsTextFile("/home/deployer/sim")
  }
}

The data file "/home/deployer/uris.dat" is 2 GB, with lines like this:

{ "id" : 1, "a" : { "0" : 1 }, "rid" : 5487628, "zid" : "10550869" }

And here is my spark-env.sh:

export SCALA_HOME=/usr/local/scala-2.10.3
export SPARK_MASTER_IP=192.168.2.184
export SPARK_MASTER_PORT=7077
export SPARK_LOCAL_IP=192.168.2.182
export SPARK_WORKER_MEMORY=20g
export SPARK_MEM=10g
export SPARK_JAVA_OPTS="-Xms4g -Xmx40g -XX:MaxPermSize=10g -XX:-UseGCOverheadLimit"

There are two processes on my server while the Spark program is running (before it crashes):

java -cp :/usr/local/spark-0.9.0-incubating-bin-hadoop2/conf:/usr/local/spark-0.9.0-incubating-bin-hadoop2/assembly/target/scala-2.10/spark-assembly_2.10-0.9.0-incubating-hadoop2.2.0.jar -Xms4g -Xmx40g -XX:MaxPermSize=10g -XX:-UseGCOverheadLimit -Xms4g -Xmx40g -XX:MaxPermSize=10g -XX:-UseGCOverheadLimit -Xms512M -Xmx512M org.apache.spark.executor.CoarseGrainedExecutorBackend akka.tcp://spark@192.168.2.183:51339/user/CoarseGrainedScheduler 0 192.168.2.182 16 akka.tcp://sparkWorker@192.168.2.182:45588/user/Worker app-20140415172433-0001

java -cp :/usr/local/spark-0.9.0-incubating-bin-hadoop2/conf:/usr/local/spark-0.9.0-incubating-bin-hadoop2/assembly/target/scala-2.10/spark-assembly_2.10-0.9.0-incubating-hadoop2.2.0.jar -Dspark.akka.logLifecycleEvents=true -Djava.library.path= -Xms512m -Xmx512m org.apache.spark.deploy.worker.Worker spark://192.168.2.184:7077

Can anybody help me? Thanks very much!
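P.S. In case a self-contained repro is useful, below is a stripped-down sketch of just the step that crashes, run in local mode on a couple of in-memory lines instead of the 2G file. The sample records are made up, and the sketch uses org.json.JSONObject for the parsing; it only reproduces the collect-and-broadcast pattern, not the whole similarity job.

import org.apache.spark.SparkContext
import org.json.JSONObject

object BroadcastRepro {
  def main(args: Array[String]) {
    // Local mode with two threads, just for the repro.
    val sc = new SparkContext("local[2]", "Broadcast Repro")

    // A few made-up lines in the same shape as uris.dat.
    val lines = Seq(
      """{ "id" : 1, "a" : { "0" : 1 }, "rid" : 5487628, "zid" : "10550869" }""",
      """{ "id" : 2, "a" : { "0" : 1 }, "rid" : 5487629, "zid" : "10550870" }""")

    val jsonRDD = sc.parallelize(lines).map(line => new JSONObject(line))

    // Same pattern as the real job: pull every distinct rid back to the
    // driver and broadcast the list. With the 2G input this is the line
    // that throws java.lang.OutOfMemoryError: Java heap space.
    val resourcesRDD = jsonRDD.map(j => j.get("rid").toString.toLong).distinct
    val bcResources = sc.broadcast(resourcesRDD.collect.toList)

    println(bcResources.value)
    sc.stop()
  }
}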