@szha I found that training with mx.mod.Module and MXNET_BACKWARD_DO_MIRROR set to 1 takes more GPU memory than a Gluon HybridBlock. Note that when MXNET_BACKWARD_DO_MIRROR is set to 1, MXNET_USE_FUSION also has to be set to 0, because otherwise relu appears to get fused and the mirroring does not apply. Does this mean that Gluon does not need MXNET_BACKWARD_DO_MIRROR? Or that we cannot generate a Symbol from a HybridBlock and must instead write the network with the pure symbol API?
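For context, this is the mirroring setup I am assuming (a minimal sketch: MXNET_BACKWARD_DO_MIRROR and MXNET_USE_FUSION are the documented MXNet environment variables, but setting them from Python before the first `import mxnet` is my assumption about when the backend picks them up):

```python
import os

# Assumption: these need to be set before mxnet is imported / the graph is built,
# otherwise the backend keeps its defaults.
os.environ["MXNET_BACKWARD_DO_MIRROR"] = "1"  # recompute cheap ops (e.g. relu) in backward to save memory
os.environ["MXNET_USE_FUSION"] = "0"          # turn off pointwise fusion so relu stays a separate node

import mxnet as mx
```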
I tested the memory consumption with the following code:

```python
import mxnet as mx
import mxnet.autograd as ag


class NaiveDataset(object):
    def __len__(self):
        return 10000

    def __getitem__(self, idx):
        if idx % 2 == 0:
            label = mx.nd.zeros(shape=(1000,))
            label[0] = 1
            return mx.nd.array(mx.nd.zeros(shape=(3, 224, 224))), label
        else:
            label = mx.nd.zeros(shape=(1000,))
            label[1] = 1
            return mx.nd.array(mx.nd.ones(shape=(3, 224, 224))), label


def train_gluon_model_with_module():
    import os
    # os.environ["MXNET_BACKWARD_DO_MIRROR"] = "1"
    # os.environ["MXNET_USE_FUSION"] = "0"
    ctx_list = [mx.gpu(0)]
    from models.backbones.resnet._resnetv1b import resnet50_v1b
    net = resnet50_v1b(pretrained=False)
    # net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
    net.initialize()
    _ = net(mx.nd.zeros(shape=(1, 3, 224, 224)))

    # copy the Gluon parameters into plain dicts so Module.init_params can consume them
    arg_params = {}
    aux_params = {}
    arg_params_collected = net.collect_params()
    for k in arg_params_collected:
        arg_params[k] = arg_params_collected[k].data(mx.cpu())
    for k in arg_params_collected:
        aux_params[k] = arg_params_collected[k].data(mx.cpu())

    # generate a Symbol from the HybridBlock and train it through the Module API
    data = mx.sym.var(name="data")
    sym = net(data)
    module = mx.mod.Module(sym, data_names=['data'], label_names=[], context=ctx_list)
    module.bind(data_shapes=[("data", (len(ctx_list) * 2, 3, 224, 224))])
    module.init_params(arg_params=arg_params, aux_params=aux_params,
                       allow_missing=False, allow_extra=True)
    module.init_optimizer(force_init=True)

    train_loader = mx.gluon.data.DataLoader(dataset=NaiveDataset(), batch_size=100, num_workers=8,
                                            last_batch="discard", shuffle=True, thread_pool=False)
    for data_batch in train_loader:
        module_data_batch = mx.io.DataBatch(data=[data_batch[0], ], label=None)
        module.forward(module_data_batch, is_train=True)
        y_hat = module.get_outputs(merge_multi_context=True)
        label_list = mx.gluon.utils.split_and_load(data_batch[1], ctx_list=ctx_list, batch_axis=0)
        preds_list = mx.gluon.utils.split_and_load(y_hat[0], ctx_list=ctx_list, batch_axis=0)

        # compute the loss gradient w.r.t. the Module outputs with autograd,
        # then feed it back through module.backward
        pred_grad_list = []
        for pred, label in zip(preds_list, label_list):  # type: mx.nd.NDArray, mx.nd.NDArray
            pred.attach_grad()
            label.attach_grad()
            with ag.record():
                pred_log_softmax = mx.nd.log_softmax(pred, axis=1)
                loss = pred_log_softmax * label * -1
            loss.backward()
            pred_grad_list.append(pred.grad)
        pred_gradients = mx.nd.concatenate(pred_grad_list, axis=0)
        module.backward([pred_gradients])
        module.update()
        print(loss.sum().asnumpy())
        mx.nd.waitall()


def train_gluon_model_with_gluon():
    ctx_list = [mx.gpu(0)]
    net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
    net.initialize()
    net.collect_params().reset_ctx(ctx_list)
    net.hybridize(static_alloc=True)
    trainer = mx.gluon.Trainer(
        net.collect_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {
            'learning_rate': 1e-2
        },
    )
    train_loader = mx.gluon.data.DataLoader(dataset=NaiveDataset(), batch_size=100, num_workers=8,
                                            last_batch="discard", shuffle=True, thread_pool=False)
    for data_batch in train_loader:
        data_list = mx.gluon.utils.split_and_load(data_batch[0], ctx_list=ctx_list, batch_axis=0)
        label_list = mx.gluon.utils.split_and_load(data_batch[1], ctx_list=ctx_list, batch_axis=0)
        losses = []
        for data, label in zip(data_list, label_list):  # type: mx.nd.NDArray, mx.nd.NDArray
            with ag.record():
                y_hat = net(data)
                pred_log_softmax = mx.nd.log_softmax(y_hat, axis=1)
                loss = pred_log_softmax * label * -1
            losses.append(loss)
        ag.backward(losses)
        trainer.step(1)
        print(loss.sum().asnumpy())
        mx.nd.waitall()


if __name__ == '__main__':
    # train_gluon_model_with_module()
    train_gluon_model_with_gluon()
```

By default, train_gluon_model_with_module and train_gluon_model_with_gluon need almost the same amount of GPU memory, but with MXNET_BACKWARD_DO_MIRROR set to 1 and MXNET_USE_FUSION set to 0, train_gluon_model_with_module fails with an OOM exception.
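To make the memory comparison concrete, the device memory of the two paths can be read back inside the training loops with something like the sketch below (assuming mx.context.gpu_memory_info is available in this MXNet build; where exactly it is called is arbitrary):

```python
import mxnet as mx


def report_gpu_memory(tag, device_id=0):
    # gpu_memory_info returns (free, total) in bytes for the given GPU
    free, total = mx.context.gpu_memory_info(device_id)
    used_mib = (total - free) / 1024 ** 2
    print("{}: {:.0f} MiB used / {:.0f} MiB total".format(tag, used_mib, total / 1024 ** 2))


# e.g. call it right after mx.nd.waitall() in either training loop:
# report_gpu_memory("module path")
# report_gpu_memory("gluon path")
```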