Updated Branches: refs/heads/branch-0.8 8b091febd -> daaaee175
Merge pull request #218 from JoshRosen/spark-970-pyspark-unicode-error Fix UnicodeEncodeError in PySpark saveAsTextFile() (SPARK-970) This fixes [SPARK-970](https://spark-project.atlassian.net/browse/SPARK-970), an issue where PySpark's saveAsTextFile() could throw UnicodeEncodeError when called on an RDD of Unicode strings. Please merge this into master and branch-0.8. (cherry picked from commit 8a3475aed66617772f4e98e9f774b109756eb391) Signed-off-by: Reynold Xin <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/daaaee17 Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/daaaee17 Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/daaaee17 Branch: refs/heads/branch-0.8 Commit: daaaee175a6c07115c8eef85611f3717123c43b3 Parents: 8b091fe Author: Reynold Xin <[email protected]> Authored: Tue Dec 3 14:21:40 2013 -0800 Committer: Reynold Xin <[email protected]> Committed: Tue Dec 3 14:22:05 2013 -0800 ---------------------------------------------------------------------- python/pyspark/rdd.py | 5 ++++- python/pyspark/tests.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/daaaee17/python/pyspark/rdd.py ---------------------------------------------------------------------- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 7019fb8..0c599e0 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -598,7 +598,10 @@ class RDD(object): '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n' """ def func(split, iterator): - return (str(x).encode("utf-8") for x in iterator) + for x in iterator: + if not isinstance(x, basestring): + x = unicode(x) + yield x.encode("utf-8") keyed = PipelinedRDD(self, func) keyed._bypass_serializer = True keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path) http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/daaaee17/python/pyspark/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 29d6a12..d3f6c2b 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -19,6 +19,8 @@ Unit tests for PySpark; additional tests are implemented as doctests in individual modules. """ +from fileinput import input +from glob import glob import os import shutil import sys @@ -137,6 +139,19 @@ class TestAddFile(PySparkTestCase): self.assertEqual("Hello World from inside a package!", UserClass().hello()) +class TestRDDFunctions(PySparkTestCase): + + def test_save_as_textfile_with_unicode(self): + # Regression test for SPARK-970 + x = u"\u00A1Hola, mundo!" + data = self.sc.parallelize([x]) + tempFile = NamedTemporaryFile(delete=True) + tempFile.close() + data.saveAsTextFile(tempFile.name) + raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) + self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) + + class TestIO(PySparkTestCase): def test_stdout_redirection(self):
