Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/13599#discussion_r160072831 --- Diff: python/pyspark/context.py --- @@ -1023,6 +1039,33 @@ def getConf(self): conf.setAll(self._conf.getAll()) return conf + def install_packages(self, packages, install_driver=True): + """ + install python packages on all executors and driver through pip + :param packages: string for single package or a list of string for multiple packages + :param install_driver: whether to install packages in client + """ + if self._conf.get("spark.pyspark.virtualenv.enabled") != "true": + raise Exception("install_packages can only be called when " + "spark.pyspark.virtualenv.enabled set as true") + if isinstance(packages, basestring): + packages = [packages] + num_executors = int(self._conf.get("spark.executor.instances")) + dummyRDD = self.parallelize(range(num_executors), num_executors) + + def _run_pip(packages, iterator): + import pip + pip.main(["install"] + packages) + + # run it in the main thread. Will do it in a separate thread after + # https://github.com/pypa/pip/issues/2553 is fixed + if install_driver: + import threading --- End diff -- Sorry, maybe I missed something. Do we need to import this?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org