[ https://issues.apache.org/jira/browse/SPARK-21753?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16136180#comment-16136180 ]
Hyukjin Kwon commented on SPARK-21753: -------------------------------------- I merged her PR and double checked if it works: Before: {code} PYSPARK_PYTHON=pypy ./bin/spark-submit examples/src/main/python/pi.py ... PicklingError: Could not serialize object: AttributeError: 'builtin-code' object has no attribute 'co_filename' {code} After: {code} PYSPARK_PYTHON=pypy ./bin/spark-submit examples/src/main/python/pi.py ... Pi is roughly 3.137520 {code} > running pi example with pypy on spark fails to serialize > --------------------------------------------------------- > > Key: SPARK-21753 > URL: https://issues.apache.org/jira/browse/SPARK-21753 > Project: Spark > Issue Type: Bug > Components: PySpark > Affects Versions: 2.1.1 > Reporter: Thomas Graves > > I'm trying to run the pi example > (https://github.com/apache/spark/blob/master/examples/src/main/python/pi.py) > on pyspark using pypy 2.5.1 but everything I've tried results in a > serialization error: > Traceback (most recent call last): > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 147, > in dump > return Pickler.dump(self, obj) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 224, in dump > self.save(obj) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 562, in save_tuple > save(element) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 254, > in save_function > self.save_function_tuple(obj) > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 291, > in save_function_tuple > save((code, closure, base_globals)) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > 
line 286, in save > f(self, obj) # Call unbound method with explicit self > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 548, in save_tuple > save(element) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 600, in save_list > self._batch_appends(iter(obj)) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 633, in _batch_appends > save(x) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 254, > in save_function > self.save_function_tuple(obj) > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 291, > in save_function_tuple > save((code, closure, base_globals)) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 548, in save_tuple > save(element) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 600, in save_list > self._batch_appends(iter(obj)) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 636, in _batch_appends > save(tmp[0]) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 248, > in save_function > self.save_function_tuple(obj) > File 
"/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 296, > in save_function_tuple > save(f_globals) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 653, in save_dict > self._batch_setitems(obj.iteritems()) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 690, in _batch_setitems > save(v) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 447, > in save_instancemethod > obj=obj) > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 581, > in save_reduce > save(args) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 548, in save_tuple > save(element) > File "//home/tgraves/pypy-my-own-package-name/lib-python/2.7/pickle.py", > line 286, in save > f(self, obj) # Call unbound method with explicit self > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 246, > in save_function > if islambda(obj) or obj.__code__.co_filename == '<stdin>' or themodule is > None: > AttributeError: 'builtin-code' object has no attribute 'co_filename' > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "/home/tgraves/y-spark-git/python/pyspark/rdd.py", line 834, in reduce > vals = self.mapPartitions(func).collect() > File "/home/tgraves/y-spark-git/python/pyspark/rdd.py", line 808, in collect > port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd()) > File "/home/tgraves/y-spark-git/python/pyspark/rdd.py", line 2440, in _jrdd > 
self._jrdd_deserializer, profiler) > File "/home/tgraves/y-spark-git/python/pyspark/rdd.py", line 2373, in > _wrap_function > pickled_command, broadcast_vars, env, includes = > _prepare_for_python_RDD(sc, command) > File "/home/tgraves/y-spark-git/python/pyspark/rdd.py", line 2359, in > _prepare_for_python_RDD > pickled_command = ser.dumps(command) > File "/home/tgraves/y-spark-git/python/pyspark/serializers.py", line 460, > in dumps > return cloudpickle.dumps(obj, 2) > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 703, > in dumps > cp.dump(obj) > File "/home/tgraves/y-spark-git/python/pyspark/cloudpickle.py", line 160, > in dump > raise pickle.PicklingError(msg) > It looks like the issue is with serializing random(). If you remove random() > from the function, then everything works fine. > I'm just running > PYSPARK_PYTHON=//home/tgraves/pypy-my-own-package-name/bin/pypy ./bin/pyspark > I've tried multiple versions of pypy from 2.5.1 to 5.8.0. I tried the > portable version as well as a pypy built from source. > If it works for others, perhaps I have a setup issue; any hints on that would > be appreciated. -- This message was sent by Atlassian JIRA (v6.4.14#64029) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org