RogerChern commented on issue #8884: forward can't run parallelly using multi-gpus when custom operator using numpy
URL: https://github.com/apache/incubator-mxnet/issues/8884#issuecomment-359232845

Here is an MCVE, run in a 2-GPU setting:

```python
import time

import mxnet as mx
import numpy as np


class DebugOperator(mx.operator.CustomOp):
    def __init__(self, **kwargs):
        super(DebugOperator, self).__init__()
        self.pos = kwargs.get("pos", None)

    def forward(self, is_train, req, in_data, out_data, aux):
        # Log entry/exit time per device to show whether the two GPUs'
        # forward calls overlap or run back to back.
        print("entering %d: %.4f" % (in_data[0][0].context.device_id, time.time()))
        time.sleep(0.1)  # stand-in for the numpy work inside a real custom op
        self.assign(out_data[0], req[0], 0)
        print("exiting %d: %.4f" % (in_data[0][0].context.device_id, time.time()))

    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        self.assign(in_grad[0], req[0], 0)


@mx.operator.register("Debug")
class DebugProp(mx.operator.CustomOpProp):
    def __init__(self, **kwargs):
        super(DebugProp, self).__init__(need_top_grad=False)
        self._kwargs = kwargs

    def list_arguments(self):
        return ['data']

    def list_outputs(self):
        return ['output']

    def infer_shape(self, in_shape):
        return in_shape, [(1, )]

    def create_operator(self, ctx, shapes, dtypes):
        return DebugOperator(**self._kwargs)


def get_symbol():
    data = mx.sym.var("data")
    label = mx.sym.var("softmax_label")
    proj = mx.sym.FullyConnected(data, num_hidden=1)
    debug = mx.sym.Custom(proj, op_type="Debug", name="debug")
    return mx.sym.Group([debug, label])


if __name__ == "__main__":
    gpus = [0, 1]
    sym = get_symbol()
    mod = mx.module.Module(sym, context=[mx.gpu(i) for i in gpus])
    mod.bind(data_shapes=[("data", (len(gpus), 1))],
             label_shapes=[("softmax_label", (len(gpus), 1))])
    data = mx.io.NDArrayIter(data=np.zeros((10000, 1)),
                             label=np.zeros((10000, 1)),
                             batch_size=len(gpus))
    mod.fit(data, num_epoch=1,
            eval_metric=mx.metric.Loss(output_names=["debug_output"]))
```

The output is:

```
entering 1: 1516523993.4081
exiting 1: 1516523993.5086
entering 0: 1516523993.5088
exiting 0: 1516523993.6092
entering 1: 1516523993.6362
exiting 1: 1516523993.7368
entering 0: 1516523993.7369
exiting 0: 1516523993.8373
entering 1: 1516523993.8394
exiting 1: 1516523993.9398
entering 0: 1516523993.9400
exiting 0: 1516523994.0404
entering 1: 1516523994.0634
exiting 1: 1516523994.1692
entering 0: 1516523994.1694
exiting 0: 1516523994.2698
entering 0: 1516523994.2750
exiting 0: 1516523994.3755
entering 1: 1516523994.3757
exiting 1: 1516523994.4761
entering 0: 1516523994.4873
exiting 0: 1516523994.5877
entering 1: 1516523994.5879
exiting 1: 1516523994.6883
entering 0: 1516523994.6943
exiting 0: 1516523994.7948
```

Note that each `exiting` timestamp essentially coincides with the next `entering` timestamp: the forward calls on the two GPUs never overlap, so the custom op runs fully serialized instead of in parallel.
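For reference, a minimal standalone sketch (standard library only, not part of the repro) that isolates the cause: `time.sleep` releases the GIL in CPython, so if MXNet dispatched the two CustomOp forwards on separate threads, their prints would interleave the way the plain threads below do. Since the repro above shows no overlap, the serialization appears to come from how MXNet schedules CustomOp callbacks rather than from the GIL itself.

```python
import threading
import time


def fake_forward(device_id):
    # Mimics the logging in DebugOperator.forward, but in a plain thread.
    print("entering %d: %.4f" % (device_id, time.time()))
    time.sleep(0.1)  # releases the GIL, so both threads can sleep concurrently
    print("exiting %d: %.4f" % (device_id, time.time()))


threads = [threading.Thread(target=fake_forward, args=(i,)) for i in (0, 1)]
for t in threads:
    t.start()
for t in threads:
    t.join()
# Expected: both "entering" lines print before either "exiting" line,
# i.e. the two sleeps overlap, unlike the MXNet output above.
```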
