Github user BryanCutler commented on a diff in the pull request:
https://github.com/apache/spark/pull/21546#discussion_r213819766
--- Diff: python/pyspark/context.py ---
@@ -494,10 +494,14 @@ def f(split, iterator):
c = list(c) # Make it a list so we can compute its length
batchSize = max(1, min(len(c) // numSlices, self._batchSize or
1024))
serializer = BatchedSerializer(self._unbatched_serializer,
batchSize)
- jrdd = self._serialize_to_jvm(c, numSlices, serializer)
+
+ def reader_func(temp_filename):
+ return self._jvm.PythonRDD.readRDDFromFile(self._jsc,
temp_filename, numSlices)
+
+ jrdd = self._serialize_to_jvm(c, serializer, reader_func)
return RDD(jrdd, self, serializer)
- def _serialize_to_jvm(self, data, parallelism, serializer):
+ def _serialize_to_jvm(self, data, serializer, reader_func):
--- End diff --
I filed https://issues.apache.org/jira/browse/SPARK-25272, which will make the
output show more clearly that the ArrowTests were run.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]