Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19349#discussion_r141101690
--- Diff: python/pyspark/serializers.py ---
@@ -251,6 +256,36 @@ def __repr__(self):
return "ArrowPandasSerializer"
+class ArrowStreamPandasSerializer(Serializer):
+ """
+ (De)serializes a vectorized(Apache Arrow) stream.
+ """
+
+ def load_stream(self, stream):
+ import pyarrow as pa
+ reader = pa.open_stream(stream)
+ for batch in reader:
+ table = pa.Table.from_batches([batch])
+ yield [c.to_pandas() for c in table.itercolumns()]
+
+ def dump_stream(self, iterator, stream):
+ import pyarrow as pa
+ writer = None
+ try:
+ for series in iterator:
+ batch = _create_batch(series)
+ if writer is None:
+ write_int(0, stream)
--- End diff --
Shall we add a new entry in `SpecialLengths` and use it here instead of the literal `0`?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]