My code is here:

from pyspark import SparkConf, SparkContext
def Undirect(edge):
    """Parse one line of a tab-separated edge list into key/value pairs.

    Returns a one-element list ``[(source, target)]`` when the line has at
    least two tab-separated fields and the first field consists only of
    digits. Returns an empty list otherwise (blank lines, comment/header
    lines, or lines with a single field), so the function can be passed
    straight to ``flatMap``.

    NOTE(review): despite the name, only the source -> target direction is
    emitted; the reverse edge is not added.
    """
    vector = edge.strip().split('\t')
    # Require two fields: a numeric first field with no tab after it used to
    # raise IndexError on vector[1] and abort the whole job.
    if len(vector) >= 2 and vector[0].isdigit():
        return [(vector[0], vector[1])]
    return []


def main():
    """Build a tab-separated adjacency list from the edge file.

    Each output line is ``source<TAB>target1<TAB>target2...``. The order of
    the targets within a line is not guaranteed.
    """
    conf = SparkConf()
    conf.setMaster("spark://compute-0-14:7077")
    conf.setAppName("adjacencylist")
    conf.set("spark.executor.memory", "1g")
    sc = SparkContext(conf=conf)
    try:
        lines = sc.textFile("file:///home/xzhang/data/soc-LiveJournal1.txt")
        # Group the neighbours per vertex and join them once. The previous
        # reduceByKey(lambda a, b: a + "\t" + b) re-copied an ever-growing
        # string on every merge, which is quadratic for high-degree vertices.
        records = lines.flatMap(Undirect).groupByKey()
        # Debugging aids kept from the original script:
        # print(records.count())
        # records = records.sortByKey()
        records = records.map(lambda kv: kv[0] + "\t" + "\t".join(kv[1]))
        records.saveAsTextFile("file:///home/xzhang/data/result")
    finally:
        # Always release the cluster resources held by this application.
        sc.stop()


if __name__ == "__main__":
    main()

# --
# View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/spark-is-running-extremely-slow-with-larger-data-set-like-2G-tp17152p17153.html
# Sent from the Apache Spark User List mailing list archive at Nabble.com.
#
# ---------------------------------------------------------------------
# To unsubscribe, e-mail: user-unsubscr...@spark.apache.org
# For additional commands, e-mail: user-h...@spark.apache.org