Github user holdenk commented on a diff in the pull request:
https://github.com/apache/spark/pull/13599#discussion_r162856254
--- Diff: python/pyspark/context.py ---
@@ -1023,6 +1032,35 @@ def getConf(self):
         conf.setAll(self._conf.getAll())
         return conf
+    def install_packages(self, packages, install_driver=True):
+        """
+        Install Python packages on all executors and the driver through pip.
+        pip is installed by default whether native virtualenv or conda is
+        used, so pip is guaranteed to be available when virtualenv is enabled.
+
+        :param packages: a string for a single package, or a list of strings
+            for multiple packages
+        :param install_driver: whether to also install the packages on the
+            driver (client)
+        """
+        if self._conf.get("spark.pyspark.virtualenv.enabled") != "true":
+            raise RuntimeError("install_packages can only be called when "
+                               "spark.pyspark.virtualenv.enabled is set to true")
+        if isinstance(packages, basestring):
+            packages = [packages]
+        # statusTracker.getExecutorInfos() appears to return the driver as
+        # well as the executors, hence the -1 here.
+        num_executors = len(self._jsc.sc().statusTracker().getExecutorInfos()) - 1
+        dummyRDD = self.parallelize(range(num_executors), num_executors)
+
+        def _run_pip(packages, iterator):
+            # runs once per partition; installs the packages into the
+            # executor's (or driver's) Python environment
+            import pip
+            pip.main(["install"] + packages)
+
+        # Run pip in the main thread for now; move it to a separate thread
+        # once https://github.com/pypa/pip/issues/2553 is fixed.
+        if install_driver:
+            _run_pip(packages, None)
+
+        import functools
+        dummyRDD.foreachPartition(functools.partial(_run_pip, packages))
--- End diff --
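
For context, a minimal usage sketch of the API proposed in this diff
(hypothetical session; assumes the application was launched with
spark.pyspark.virtualenv.enabled=true):

    # Hypothetical usage of the proposed API; requires that the app was
    # launched with spark.pyspark.virtualenv.enabled=true.
    from pyspark import SparkContext

    sc = SparkContext(appName="install-packages-demo")
    sc.install_packages("numpy")                # single package as a string
    sc.install_packages(["pandas", "scipy"])    # multiple packages as a list
    sc.install_packages("requests", install_driver=False)  # skip the driver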
@zjffdu No, it's not; hard -1.
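
Worth noting: pip.main is not a supported programmatic API (later pip
releases removed it from the public interface), and the linked pip issue
concerns driving pip in-process. A sketch of a more robust invocation,
assuming pip is installed in each executor's Python environment:

    # Sketch only: shell out to pip instead of calling pip.main, which is
    # not a supported API. Assumes pip is available in the same Python
    # environment that runs the executor.
    import subprocess
    import sys

    def _run_pip(packages, iterator):
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + packages)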