RogerChern commented on issue #8884: forward can't run parallelly using multi-gpus when custom operator using numpy
URL: https://github.com/apache/incubator-mxnet/issues/8884#issuecomment-359232845
 
 
    The following MCVE was run in a 2-GPU setting:
   
   ```python
   import time
   import mxnet as mx
   import numpy as np
   
   
   class DebugOperator(mx.operator.CustomOp):
       def __init__(self, **kwargs):
           super(DebugOperator, self).__init__()
           self.pos = kwargs.get("pos", None)
   
       def forward(self, is_train, req, in_data, out_data, aux):
           print("entering %d: %.4f" % (in_data[0][0].context.device_id, 
time.time()))
           time.sleep(0.1)
           self.assign(out_data[0], req[0], 0)
           print("exiting %d: %.4f" % (in_data[0][0].context.device_id, 
time.time()))
   
       def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
           self.assign(in_grad[0], req[0], 0)
   
   
   @mx.operator.register("Debug")
   class DebugProp(mx.operator.CustomOpProp):
       def __init__(self, **kwargs):
           super(DebugProp, self).__init__(need_top_grad=False)
           self._kwargs = kwargs
   
       def list_arguments(self):
           return ['data']
   
       def list_outputs(self):
           return ['output']
   
       def infer_shape(self, in_shape):
           return in_shape, [(1, )]
   
       def create_operator(self, ctx, shapes, dtypes):
           return DebugOperator(**self._kwargs)
   
   
   def get_symbol():
       data = mx.sym.var("data")
       label = mx.sym.var("softmax_label")
       proj = mx.sym.FullyConnected(data, num_hidden=1)
       debug = mx.sym.Custom(proj, op_type="Debug", name="debug")
       return mx.sym.Group([debug, label])
   
   
   if __name__ == "__main__":
       gpus = [0, 1]
       sym = get_symbol()
       mod = mx.module.Module(sym, context=[mx.gpu(i) for i in gpus])
        mod.bind(data_shapes=[("data", (len(gpus), 1))], label_shapes=[("softmax_label", (len(gpus), 1))])
        data = mx.io.NDArrayIter(data=np.zeros((10000, 1)), label=np.zeros((10000, 1)), batch_size=len(gpus))
        mod.fit(data, num_epoch=1, eval_metric=mx.metric.Loss(output_names=["debug_output"]))
   ```
   
    The output is:
   ```
   entering 1: 1516523993.4081
   exiting 1: 1516523993.5086
   entering 0: 1516523993.5088
   exiting 0: 1516523993.6092
   entering 1: 1516523993.6362
   exiting 1: 1516523993.7368
   entering 0: 1516523993.7369
   exiting 0: 1516523993.8373
   entering 1: 1516523993.8394
   exiting 1: 1516523993.9398
   entering 0: 1516523993.9400
   exiting 0: 1516523994.0404
   entering 1: 1516523994.0634
   exiting 1: 1516523994.1692
   entering 0: 1516523994.1694
   exiting 0: 1516523994.2698
   entering 0: 1516523994.2750
   exiting 0: 1516523994.3755
   entering 1: 1516523994.3757
   exiting 1: 1516523994.4761
   entering 0: 1516523994.4873
   exiting 0: 1516523994.5877
   entering 1: 1516523994.5879
   exiting 1: 1516523994.6883
   entering 0: 1516523994.6943
   exiting 0: 1516523994.7948
   ```
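    
    To make the serialization explicit, a small helper like the following (a sketch, not part of the MCVE) can parse the log above and check whether any two forward intervals on different devices overlap. For the output shown it finds none, i.e. the custom op's `forward` never runs concurrently on the two GPUs:
    
    ```python
    import re
    
    
    def intervals(log_text):
        """Pair each 'entering' line with the matching 'exiting' line per device."""
        events = re.findall(r"(entering|exiting) (\d+): ([\d.]+)", log_text)
        open_ts, spans = {}, []
        for kind, dev, ts in events:
            if kind == "entering":
                open_ts[dev] = float(ts)
            else:
                spans.append((dev, open_ts.pop(dev), float(ts)))
        return spans
    
    
    def any_overlap(spans):
        """True if any two intervals on *different* devices overlap in time."""
        for i, (d1, s1, e1) in enumerate(spans):
            for d2, s2, e2 in spans[i + 1:]:
                if d1 != d2 and s1 < e2 and s2 < e1:
                    return True
        return False
    
    
    # any_overlap(intervals(captured_log)) returns False for the log above,
    # confirming the forward calls are serialized across the two GPUs.
    ```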
