GitHub user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/19349#discussion_r141250251 --- Diff: python/pyspark/serializers.py --- @@ -251,6 +256,36 @@ def __repr__(self): return "ArrowPandasSerializer" +class ArrowStreamPandasSerializer(Serializer): + """ + Serializes Pandas.Series as Arrow data with Arrow streaming format. + """ + + def load_stream(self, stream): + import pyarrow as pa + reader = pa.open_stream(stream) + for batch in reader: + table = pa.Table.from_batches([batch]) + yield [c.to_pandas() for c in table.itercolumns()] + + def dump_stream(self, iterator, stream): --- End diff -- Sure, I'll add comments.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org