I'm trying to run wordcount on several files, but stuck in failing to pass
the output from one file to another. 
Any help would be appreciate.

sc = SparkContext()
for datafile in inputfiles:
     lines = sc.textFile(indir + "/" + datafile, 1)
     counts = lines.flatMap(lambda x: x.split(' ')) \
                        .map(lambda x: (x, 1)) \
                        .reduceByKey(add)
     output = counts.collect()

outf = open(outfile, 'w')
for (word, count) in output:
    #ngram_count = ngram_count + count
    if (count >= 1):
        outf.write(word.encode('utf-8') + '\t' + str(count) + '\n')
        #eff_ngram_count = eff_ngram_count + count
outf.close()





--
View this message in context: 
http://apache-spark-user-list.1001560.n3.nabble.com/wordcount-accross-several-files-tp20314.html
Sent from the Apache Spark User List mailing list archive at Nabble.com.

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscr...@spark.apache.org
For additional commands, e-mail: user-h...@spark.apache.org

Reply via email to