Github user goldmedal commented on a diff in the pull request: https://github.com/apache/spark/pull/19339#discussion_r141080522 --- Diff: python/pyspark/sql/readwriter.py --- @@ -420,7 +425,29 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine) if isinstance(path, basestring): path = [path] - return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path))) + if type(path) == list: + return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path))) + elif isinstance(path, RDD): + def func(iterator): + for x in iterator: + if not isinstance(x, basestring): + x = unicode(x) + if isinstance(x, unicode): + x = x.encode("utf-8") + yield x + keyed = path.mapPartitions(func) + keyed._bypass_serializer = True + jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString()) + # [SPARK-22112] + # There aren't any jvm api for creating a dataframe from rdd storing csv. --- End diff -- ok let me fix it. thanks :)
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org