Github user squito commented on a diff in the pull request:
https://github.com/apache/spark/pull/21546#discussion_r213835409
--- Diff: python/pyspark/context.py ---
@@ -494,10 +494,14 @@ def f(split, iterator):
c = list(c) # Make it a list so we can compute its length
batchSize = max(1, min(len(c) // numSlices, self._batchSize or
1024))
serializer = BatchedSerializer(self._unbatched_serializer,
batchSize)
- jrdd = self._serialize_to_jvm(c, numSlices, serializer)
+
+ def reader_func(temp_filename):
+ return self._jvm.PythonRDD.readRDDFromFile(self._jsc,
temp_filename, numSlices)
+
+ jrdd = self._serialize_to_jvm(c, serializer, reader_func)
return RDD(jrdd, self, serializer)
- def _serialize_to_jvm(self, data, parallelism, serializer):
+ def _serialize_to_jvm(self, data, serializer, reader_func):
--- End diff --
Thanks @BryanCutler, sorry I didn't know where to look for those; they
look much better than what I would have added!
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]