@szha  I found that training with mx.mod.Module with MXNET_BACKWARD_DO_MIRROR 
set to 1 takes more GPU memory than a Gluon HybridBlock. This is because when 
MXNET_BACKWARD_DO_MIRROR is set to 1, MXNET_USE_FUSION must also be set to 0, 
since it seems that relu is otherwise fused. Does this mean that Gluon 
does not need MXNET_BACKWARD_DO_MIRROR? Or that we can't generate a Symbol from 
a HybridBlock and must write the network with the pure symbol API?

I tested the memory consumption with the following code:
```python
import mxnet as mx
import mxnet.autograd as ag


class NaiveDataset(object):
    """Synthetic 10k-sample dataset alternating between two classes.

    Even indices yield an all-zeros 3x224x224 image labelled class 0;
    odd indices yield an all-ones image labelled class 1. Labels are
    one-hot vectors of length 1000 (matching resnet50's output width).
    """

    def __len__(self):
        # Fixed-size synthetic dataset.
        return 10000

    def __getitem__(self, idx):
        # Deterministically alternate between the two classes; the two
        # original branches were near-duplicates, so they are merged here.
        cls = idx % 2
        label = mx.nd.zeros(shape=(1000, ))
        label[cls] = 1
        if cls == 0:
            image = mx.nd.zeros(shape=(3, 224, 224))
        else:
            image = mx.nd.ones(shape=(3, 224, 224))
        # NOTE: the original wrapped the image in mx.nd.array(...), which
        # is a redundant copy of an existing NDArray; return it directly.
        return image, label


def train_gluon_model_with_module():
    """Train a Gluon-built resnet50 through the symbolic Module API.

    The Gluon network is turned into a Symbol by calling ``net(data_sym)``
    on a symbolic variable and bound into a Module, so that env vars such
    as MXNET_BACKWARD_DO_MIRROR take effect during training.
    """
    import os
    # Uncomment to reproduce the memory-mirroring experiment:
    # os.environ["MXNET_BACKWARD_DO_MIRROR"]="1"
    # os.environ["MXNET_USE_FUSION"]="0"
    ctx_list = [mx.gpu(0)]
    from models.backbones.resnet._resnetv1b import resnet50_v1b
    net = resnet50_v1b(pretrained=False)
    # net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
    net.initialize()
    # One dummy forward pass materializes the (deferred-init) parameters.
    _ = net(mx.nd.zeros(shape=(1, 3, 224, 224)))

    # BUGFIX: the original filled arg_params and aux_params with two
    # identical loops over net.collect_params(), so aux_params ended up
    # holding every weight as well (only tolerated via allow_extra=True).
    # Split trainable parameters (args) from auxiliary states (aux, e.g.
    # batchnorm running_mean/running_var, which have grad_req == 'null').
    arg_params = {}
    aux_params = {}
    for name, param in net.collect_params().items():
        if param.grad_req == 'null':
            aux_params[name] = param.data(mx.cpu())
        else:
            arg_params[name] = param.data(mx.cpu())

    data = mx.sym.var(name="data")
    sym = net(data)
    module = mx.mod.Module(sym, data_names=['data'], label_names=[],
                           context=ctx_list)
    module.bind(data_shapes=[("data", (len(ctx_list) * 2, 3, 224, 224))])
    module.init_params(arg_params=arg_params, aux_params=aux_params,
                       allow_missing=False, allow_extra=True)
    module.init_optimizer(force_init=True)
    train_loader = mx.gluon.data.DataLoader(dataset=NaiveDataset(),
                                            batch_size=100,
                                            num_workers=8,
                                            last_batch="discard",
                                            shuffle=True,
                                            thread_pool=False)
    for data_batch in train_loader:
        module_data_batch = mx.io.DataBatch(data=[data_batch[0], ], label=None)
        module.forward(module_data_batch, is_train=True)
        y_hat = module.get_outputs(merge_multi_context=True)
        label_list = mx.gluon.utils.split_and_load(
            data_batch[1], ctx_list=ctx_list, batch_axis=0)
        preds_list = mx.gluon.utils.split_and_load(
            y_hat[0], ctx_list=ctx_list, batch_axis=0)
        pred_grad_list = []
        for pred, label in zip(preds_list, label_list):
            # Compute the loss (and hence pred.grad) with autograd, then
            # feed that gradient into the module's backward pass below.
            # (The original also called label.attach_grad(), which only
            # allocated a useless gradient buffer for a constant label.)
            pred.attach_grad()
            with ag.record():
                pred_log_softmax = mx.nd.log_softmax(pred, axis=1)
                loss = pred_log_softmax * label * -1
            loss.backward()
            pred_grad_list.append(pred.grad)
        pred_gradients = mx.nd.concatenate(pred_grad_list, axis=0)
        module.backward([pred_gradients])
        module.update()
        print(loss.sum().asnumpy())
        mx.nd.waitall()


def train_gluon_model_with_gluon():
    """Train resnet50_v1 end-to-end through the imperative Gluon API."""
    ctx_list = [mx.gpu(0)]
    net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
    net.initialize()
    net.collect_params().reset_ctx(ctx_list)
    net.hybridize(static_alloc=True)
    trainer = mx.gluon.Trainer(
        net.collect_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {'learning_rate': 1e-2},
    )

    train_loader = mx.gluon.data.DataLoader(
        dataset=NaiveDataset(),
        batch_size=100,
        num_workers=8,
        last_batch="discard",
        shuffle=True,
        thread_pool=False,
    )
    for batch in train_loader:
        # Shard the batch across the available devices.
        images = mx.gluon.utils.split_and_load(
            batch[0], ctx_list=ctx_list, batch_axis=0)
        targets = mx.gluon.utils.split_and_load(
            batch[1], ctx_list=ctx_list, batch_axis=0)
        losses = []
        for x, t in zip(images, targets):
            # Negative log-likelihood against the one-hot target.
            with ag.record():
                logits = net(x)
                log_probs = mx.nd.log_softmax(logits, axis=1)
                loss = log_probs * t * -1
            losses.append(loss)
        ag.backward(losses)
        trainer.step(1)
        print(loss.sum().asnumpy())
        mx.nd.waitall()


if __name__ == '__main__':
    # Run exactly one of the two variants at a time to compare their
    # GPU memory footprints.
    # train_gluon_model_with_module()
    train_gluon_model_with_gluon()

```

By default train_gluon_model_with_module and train_gluon_model_with_gluon need 
almost the same amount of GPU memory, but if MXNET_BACKWARD_DO_MIRROR is set 
to 1 and MXNET_USE_FUSION is set to 0, train_gluon_model_with_module will fail 
and raise an OOM exception. 

-- 
You are receiving this because you were mentioned.
Reply to this email directly or view it on GitHub:
https://github.com/apache/incubator-mxnet/issues/18800#issuecomment-691611097

Reply via email to