Github user srowen commented on a diff in the pull request:
https://github.com/apache/spark/pull/22404#discussion_r217155746
--- Diff: python/pyspark/context.py ---
@@ -499,19 +506,32 @@ def f(split, iterator):
def _serialize_to_jvm(self, data, parallelism, serializer):
"""
- Calling the Java parallelize() method with an ArrayList is too slow,
- because it sends O(n) Py4J commands. As an alternative, serialized
- objects are written to a file and loaded through textFile().
- """
- tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
- try:
- serializer.dump_stream(data, tempFile)
- tempFile.close()
- readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
- return readRDDFromFile(self._jsc, tempFile.name, parallelism)
- finally:
- # readRDDFromFile eagerly reads the file so we can delete right after.
- os.unlink(tempFile.name)
+ Using py4j to send a large dataset to the jvm is really slow, so we use either a file
+ or a socket if we have encryption enabled.
+ """
+ if self._encryption_enabled:
+ # with encryption, we open a server in java and send the data directly
+ server = self._jvm.PythonParallelizeServer(self._jsc.sc(), parallelism)
+ (sock_file, _) = local_connect_and_auth(server.port(), server.secret())
+ chunked_out = ChunkedStream(sock_file, 8192)
+ serializer.dump_stream(data, chunked_out)
+ chunked_out.close()
+ # this call will block until the server has read all the data and processed it (or
+ # throws an exception)
+ r = server.getResult()
+ return r
--- End diff --
Nit, do you want to just `return server.getResult()` in cases like this?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]