mathewjacob1002 commented on code in PR #41778:
URL: https://github.com/apache/spark/pull/41778#discussion_r1253709297
##########
python/pyspark/ml/torch/deepspeed/deepspeed_distributer.py:
##########
@@ -0,0 +1,167 @@
+import json
+import os
+import subprocess
+import tempfile
+from typing import (
+ Union,
+ Callable,
+ List,
+ Dict,
+ Optional,
+ Any,
+)
+from pyspark.ml.torch.distributor import Distributor, TorchDistributor
+
+
+def write_to_location(location: str, content: str) -> None:
+ os.makedirs(os.path.dirname(location), exist_ok=True)
+ with open(location, "a") as f:
+ f.write(content)
+
+
+class DeepspeedDistributor(Distributor):
+ """The user must ensure that their cluster is ssh-keychained and that
deepspeed is able to use ssh to coordinate among the nodes for the distributed
training"""
+
+ HOME = os.path.expanduser("~")
+ HOSTFILE = f"/{HOME}/hostfile"
+
+ def __init__(
+ self,
+ num_processes: int = 1,
+ local_mode: bool = True,
+ use_gpu: bool = True,
+ deepspeed_config=None,
+ ):
+ super().__init__(num_processes, local_mode, use_gpu)
+ self.deepspeed_config = deepspeed_config
+ self.temp_deepspeed_fname = None
+ self.input_params = self._create_input_params()
+ self.worker_hosts = self._setup_hostfile_info()
+ self.setup_env()
+
+ def _get_gpus_on_node(self, executor_ip: str):
+ # TODO: ask Ricky, Lu, or Maddie if this is the best way to get the
GPU information of a particular worker node
+ command = f"ssh {executor_ip} nvidia-smi -L | grep GPU | wc -l" #
pyspark doesn't support this out of the box for some reason, so sadge
Review Comment:
Rather than shelling out over ssh, are the executors guaranteed to all have
the same configuration? If so, we can read the Spark configuration property
instead: int(spark.conf.get("spark.executor.resource.gpu.amount")). What do you think?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]