Hi,
I am doing some basic preprocessing in pyspark (local mode as follows):
files = [ input files]
def read(filename,sc):
#process file
return rdd
if __name__ =="__main__":
conf = SparkConf()
conf.setMaster('local')
sc = SparkContext(conf =conf)
sc.setCheckpointDir(root+"temp/")
data = sc.parallelize([])
for i,f in enumerate(files):
data = data.union(read(f,sc))
if i ==20:
data.checkpoint()
data.count()
if i == 500:break
#print data.count()
#rdd_1 = read(files[0],sc)
data.saveAsTextFile(root+"output/")
But I see this error:
keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)
File
"/Users/ping/Desktop/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py",
line 538, in __call__
File
"/Users/ping/Desktop/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py",
line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling
o9564.saveAsTextFile.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task
serialization failed: java.lang.StackOverflowError
java.io.Bits.putInt(Bits.java:93)
java.io.ObjectOutputStream$BlockDataOutputStream.writeInt(ObjectOutputStream.java:1927)