lu-wang-dl commented on code in PR #39267:
URL: https://github.com/apache/spark/pull/39267#discussion_r1068713107


##########
python/pyspark/ml/torch/distributor.py:
##########
@@ -428,6 +432,84 @@ def _run_local_training(
 
         return output
 
+    def _get_spark_task_program(
+        self, framework_wrapper_fn: Optional[Callable], train_fn: Union[Callable, str], *args: Any
+    ) -> Callable:
+        num_processes = self.num_processes
+        num_tasks = self.num_tasks
+        use_gpu = self.use_gpu
+        input_params = self.input_params
+
+        # Spark task program
+        def wrapped_train_fn(_):  # type: ignore[no-untyped-def]
+            import os
+            from pyspark import BarrierTaskContext
+
+            CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
+
+            # The idea of setting the random port to 0 doesn't seem to work?
+            def get_free_port(address: str) -> int:
+                import socket
+                import random
+
+                while True:
+                    port = random.randint(32768, 61000)
+                    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                    if not (sock.connect_ex((address, port)) == 0):
+                        return port
+
+            def set_torch_config(context: "BarrierTaskContext") -> None:
+                addrs = [e.address.split(":")[0] for e in context.getTaskInfos()]
+
+                os.environ["MASTER_ADDR"] = str(addrs[0])
+                os.environ["MASTER_PORT"] = str(get_free_port(addrs[0]))
+                os.environ["WORLD_SIZE"] = str(num_processes)
+                os.environ["NODE_RANK"] = str(context.partitionId())
+                os.environ["RANK"] = str(context.partitionId())
+
+            def set_gpus(context: "BarrierTaskContext") -> None:
+                gpus_owned = get_gpus_owned(context)
+
+                my_num_gpus = (num_processes // num_tasks) + (
+                    context.partitionId() < (num_processes % num_tasks)
+                )
+                gpu_addresses = [str(e) for e in random.sample(gpus_owned, my_num_gpus)]
+                os.environ[CUDA_VISIBLE_DEVICES] = ",".join(gpu_addresses)
+
+            context = BarrierTaskContext.get()
+
+            if use_gpu:
+                set_gpus(context)
+            else:
+                os.environ[CUDA_VISIBLE_DEVICES] = ""
+            set_torch_config(context)
+
+            output = framework_wrapper_fn(input_params, train_fn, *args)
+
+            if context.partitionId() == 0:
+                return [output]
+            return [None]
+
+        return wrapped_train_fn
+
+    def _run_distributed_training(
+        self,
+        framework_wrapper_fn: Optional[Callable],
+        train_fn: Union[Callable, str],
+        *args: Any,
+    ) -> Optional[Any]:
+        if not framework_wrapper_fn:
+            raise RuntimeError("Unknown combination of parameters")
+        spark_task_program = self._get_spark_task_program(framework_wrapper_fn, train_fn, *args)

Review Comment:
   Why not just define the function here?
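   For example, a minimal sketch of what I have in mind (typing imports as already in distributor.py; the `wrapped_train_fn` body is elided and would be identical to the one in `_get_spark_task_program` above, so this is only an illustration of the inlined structure, not a concrete implementation):
   
   ```python
   def _run_distributed_training(
       self,
       framework_wrapper_fn: Optional[Callable],
       train_fn: Union[Callable, str],
       *args: Any,
   ) -> Optional[Any]:
       if not framework_wrapper_fn:
           raise RuntimeError("Unknown combination of parameters")
   
       # Copy the needed attributes into plain locals first, so the Spark task
       # closure below captures these values instead of pickling `self`.
       num_processes = self.num_processes
       num_tasks = self.num_tasks
       use_gpu = self.use_gpu
       input_params = self.input_params
   
       def wrapped_train_fn(_):  # type: ignore[no-untyped-def]
           # ... same body as in _get_spark_task_program above ...
           ...
   
       # ... then run wrapped_train_fn on the barrier RDD as the rest of the PR does ...
   ```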



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

