The data is in HDFS, and I'm running 2 workers with 1 GB of memory.
datafile1 is ~9KB and datafile2 is ~216MB. I can't get the job to run at all.
I've tried various settings for the number of tasks, all the way from 2 to 1024.
Has anyone else seen similar issues?
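
For reference, the task count is what gets passed as the second argument to textFile and join in the code below; setting it once via the default-parallelism property should amount to the same thing for the join's shuffle when no count is passed explicitly, e.g.:

      // Equivalent knob for the join's shuffle when no explicit count is given;
      // the value is just one example from the range I tried (2 to 1024).
      System.setProperty("spark.default.parallelism", "1024")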


import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.storage.StorageLevel

object SimpleApp {

      def main (args: Array[String]) {


System.setProperty("spark.local.dir","/spark-0.8.0-incubating-bin-cdh4/tmp");
      System.setProperty("spark.serializer",
"org.apache.spark.serializer.KryoSerializer")
      System.setProperty("spark.akka.timeout", "30")  //in seconds

      System.setProperty("spark.executor.memory","1024m")
      System.setProperty("spark.akka.frameSize", "2000")  //in MB
      System.setProperty("spark.akka.threads","8")

      val dataFile1 = "hdfs://dev01:8020/user/sa/datafile1"
      val dataFile2 = "hdfs://dev01:8020/user/sa/datafile2"
      val sc = new SparkContext("spark://dev01:7077", "Simple App",
        "/spark-0.8.0-incubating-bin-cdh4",
        List("target/scala-2.9.3/simple-project_2.9.3-1.0.jar"))

      // Note: split() takes a regex, so the pipe delimiter has to be escaped;
      // split("|") would split on every character. Key each record on field 1.
      val data10 = sc.textFile(dataFile1, 1024)
      val data11 = data10.map(x => x.split("\\|"))
      val data12 = data11.map(x => (x(1).toInt -> x))

      val data20 = sc.textFile(dataFile2, 1024)
      val data21 = data20.map(x => x.split("\\|"))
      val data22 = data21.map(x => (x(1).toInt -> x))

      val data3 = data12.join(data22, 1024)
      //val data4 = data3.distinct(4)
      //val numAs = data10.count()
      //val numBs = data20.count()
      //val numCs = data3.count()
      //val numDs = data4.count()
println("Total records after join is %s".format( data3.count()))

Thanks,
