The data is in HDFS, and I'm running 2 workers with 1 GB of memory.
datafile1 is ~9 KB and datafile2 is ~216 MB. I can't get the job to run at all.
I've tried various settings for the number of tasks, all the way from 2 to 1024.
Has anyone else seen similar issues?
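To be concrete about what I mean by "number of tasks": it's the second argument I pass to textFile (minSplits) and to join (numPartitions) in the full program below. Roughly, as a sketch (sc here is the SparkContext created further down):

// Sketch only: varying the task/partition count passed to textFile and join.
val numTasks = 1024   // varied from 2 up to 1024
val small = sc.textFile("hdfs://dev01:8020/user/sa/datafile1", numTasks)  // minSplits
val large = sc.textFile("hdfs://dev01:8020/user/sa/datafile2", numTasks)
// after keying both RDDs, the join gets the same partition count:
// keyedSmall.join(keyedLarge, numTasks)

The full program is: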
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.storage.StorageLevel
object SimpleApp {
  def main(args: Array[String]) {
    System.setProperty("spark.local.dir", "/spark-0.8.0-incubating-bin-cdh4/tmp")
    System.setProperty("spark.serializer",
      "org.apache.spark.serializer.KryoSerializer")
    System.setProperty("spark.akka.timeout", "30")       // in seconds
    System.setProperty("spark.executor.memory", "1024m")
    System.setProperty("spark.akka.frameSize", "2000")   // in MB
    System.setProperty("spark.akka.threads", "8")

    val dataFile1 = "hdfs://dev01:8020/user/sa/datafile1"
    val dataFile2 = "hdfs://dev01:8020/user/sa/datafile2"

    val sc = new SparkContext("spark://dev01:7077", "Simple App",
      "/spark-0.8.0-incubating-bin-cdh4",
      List("target/scala-2.9.3/simple-project_2.9.3-1.0.jar"))
    // Load both datasets and key each record on its second pipe-delimited field.
    // Note: String.split takes a regex, so the pipe must be given as a Char ('|')
    // or escaped ("\\|"); split("|") would split on every character.
    val data10 = sc.textFile(dataFile1, 1024)
    val data11 = data10.map(x => x.split('|'))
    val data12 = data11.map(x => (x(1).toInt -> x))

    val data20 = sc.textFile(dataFile2, 1024)
    val data21 = data20.map(x => x.split('|'))
    val data22 = data21.map(x => (x(1).toInt -> x))
    val data3 = data12.join(data22, 1024)
    //val data4 = data3.distinct(4)
    //val numAs = data10.count()
    //val numBs = data20.count()
    //val numCs = data3.count()
    //val numDs = data4.count()
    println("Total records after join is %s".format(data3.count()))
  }
}
Thanks,