xinedison opened a new issue #10382: Does memonger work for gluon to save memory?
URL: https://github.com/apache/incubator-mxnet/issues/10382

## Description

I want to reduce GPU memory consumption when using Gluon. I tried MXNet memonger, but it did not work for me. After that I set os.environ['MXNET_BACKWARD_DO_MIRROR'] = '1', but that did not help either.

## Environment info (Required)

```
----------Python Info----------
('Version :', '2.7.5')
('Compiler :', 'GCC 4.8.5 20150623 (Red Hat 4.8.5-11)')
('Build :', ('default', 'Nov 6 2016 00:28:07'))
('Arch :', ('64bit', 'ELF'))
------------Pip Info-----------
('Version :', '9.0.1')
('Directory :', '/usr/lib/python2.7/site-packages/pip')
----------MXNet Info-----------
('Version :', '1.2.0')
('Directory :', '/home/yinghuang/incubator-mxnet-newest/python/mxnet')
Hashtag not found. Not installed from pre-built package.
----------System Info----------
('Platform :', 'Linux-3.10.0-327.22.2.el7.x86_64-x86_64-with-centos-7.2.1511-Core')
('system :', 'Linux')
('node :', 'gz-open-gpu-c117')
('release :', '3.10.0-327.22.2.el7.x86_64')
('version :', '#1 SMP Thu Jun 23 17:05:11 UTC 2016')
----------Hardware Info----------
('machine :', 'x86_64')
('processor :', 'x86_64')
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                32
On-line CPU(s) list:   0-31
Thread(s) per core:    2
Core(s) per socket:    8
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 62
Model name:            Genuine Intel(R) CPU @ 2.80GHz
Stepping:              2
CPU MHz:               1706.250
BogoMIPS:              5617.25
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              25600K
NUMA node0 CPU(s):     0-7,16-23
NUMA node1 CPU(s):     8-15,24-31
----------Network Test----------
Setting timeout: 10
Timing for MXNet: https://github.com/apache/incubator-mxnet, DNS: 0.0234 sec, LOAD: 1.4669 sec.
Timing for PYPI: https://pypi.python.org/pypi/pip, DNS: 0.0253 sec, LOAD: 0.4839 sec.
Timing for FashionMNIST: https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-labels-idx1-ubyte.gz, DNS: 0.3580 sec, LOAD: 2.5292 sec.
Timing for Conda: https://repo.continuum.io/pkgs/free/, DNS: 0.0229 sec, LOAD: 0.8054 sec.
Timing for Gluon Tutorial(en): http://gluon.mxnet.io, DNS: 0.7395 sec, LOAD: 1.3768 sec.
Timing for Gluon Tutorial(cn): https://zh.gluon.ai, DNS: 0.0230 sec, LOAD: 3.4856 sec.
```

## Question definition

I am using Python 2 and the newest MXNet Gluon with 3D convolutions for video action recognition, and I want GPU memory to be used efficiently.

1) I tried [memonger](https://github.com/dmlc/mxnet-memonger.git) to optimize the symbol generated by the HybridBlock, but it had no effect despite a lot of effort.
2) I then tried to save memory by only setting the environment variable MXNET_BACKWARD_DO_MIRROR to '1', but memory usage did not go down either.

Can someone show me what I have missed for memory saving?
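One thing not shown in the reproduction script below is how the mirroring flag was set. A minimal sketch of that step (assuming the variable only needs to be in the process environment before the backend builds the backward graph):

```python
import os

# Ask the backend to recompute ("mirror") cheap activations during the backward
# pass instead of keeping them all in GPU memory. Setting it before importing
# mxnet makes sure the backend sees it.
os.environ['MXNET_BACKWARD_DO_MIRROR'] = '1'

import mxnet as mx
```

Even with this set, memory usage looks the same as without it.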
## Steps to reproduce

Here is the code for my network:

```python
import sys
import os

mxnet_path = os.path.expanduser('~') + '/incubator-mxnet-newest'
sys.path.insert(0, os.path.abspath(os.path.join(mxnet_path, "python")))

import mxnet as mx
from mxnet.gluon import nn
from mxnet import nd
from mxnet.gluon.block import _flatten, _regroup
import memonger


def bn_relu_conv(ks, nout, stride, pad, name=None):
    # BN -> ReLU -> Conv3D building block
    layer = nn.HybridSequential()
    layer.add(nn.BatchNorm())
    layer.add(nn.Activation('relu'))
    layer.add(nn.Conv3D(channels=nout, kernel_size=ks, padding=pad, strides=stride))
    return layer


def bn_relu_block(growth_rate):
    layer = nn.HybridSequential()
    layer.add(bn_relu_conv(1, nout=growth_rate, stride=1, pad=0))
    layer.add(bn_relu_conv(3, nout=growth_rate, stride=1, pad=1))
    return layer


def conv_act_layer(channels, kernel=(1, 1, 1), pad=(0, 0, 0), stride=(1, 1, 1),
                   act_type="relu", use_batchnorm=False):
    layer = nn.HybridSequential()
    layer.add(nn.Conv3D(channels=channels, kernel_size=kernel, padding=pad, strides=stride))
    layer.add(nn.BatchNorm())
    layer.add(nn.Activation(act_type))
    return layer


def transition(channels):
    transition_layer = nn.HybridSequential()
    transition_layer.add(bn_relu_conv(ks=1, nout=channels, stride=1, pad=0))
    transition_layer.add(nn.MaxPool3D(pool_size=2, strides=2))
    return transition_layer


def transition_w_o_pooling(channels):
    layer = bn_relu_conv(ks=1, nout=channels, stride=1, pad=0)
    return layer


class DsodBlock(nn.HybridBlock):
    def __init__(self, layers, growth_rate, use_memonger=False, **kwargs):
        super(DsodBlock, self).__init__(**kwargs)
        self.use_memonger = use_memonger
        self.net = nn.HybridSequential()
        for i in range(layers):
            lay = bn_relu_block(growth_rate)
            self.net.add(lay)

    def hybrid_forward(self, F, x):
        for idx, layer in enumerate(self.net):
            out = layer(x)
            x = F.concat(x, out, dim=1)
            if self.use_memonger and (idx % 2 == 0):
                # mark every other concat output as a mirror stage for memonger
                #print("use memonger true")
                x._set_attr(mirror_stage='True')
        return x


class DenseNet(nn.HybridBlock):
    def __init__(self, net_def, num_classes, growth_rate, use_memonger=False,
                 batch_size=32, input_depth=16, input_size=112, **kwargs):
        super(DenseNet, self).__init__(**kwargs)
        channels = 128
        self.use_memonger = use_memonger
        self.batch_size = batch_size
        self.input_depth = input_depth
        self.input_size = input_size
        #assert self.use_memonger
        with self.name_scope():
            self.features = nn.HybridSequential(prefix='')
            self.features.add(conv_act_layer(64, kernel=3, pad=1, stride=2, act_type="relu", use_batchnorm=True))
            self.features.add(conv_act_layer(64, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))
            self.features.add(conv_act_layer(128, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))
            self.features.add(nn.MaxPool3D(pool_size=2, strides=2, padding=1))

            for i, (dense_layers, transition_fun) in enumerate(net_def):
                self.features.add(DsodBlock(layers=dense_layers, growth_rate=growth_rate,
                                            use_memonger=use_memonger))
                channels += growth_rate * dense_layers
                self.features.add(transition_fun(channels))

            self.features.add(nn.BatchNorm())
            self.features.add(nn.Activation('relu'))
            self.features.add(nn.GlobalAvgPool3D())
            self.features.add(nn.Flatten())

            self.output = nn.Dense(num_classes, in_units=channels)

    def _get_graph(self, *args):
        # override of HybridBlock._get_graph that runs memonger.search_plan
        # on the cached symbol before it is used
        #assert False
        if not self._cached_graph:
            args, self._in_format = _flatten(args)
            if len(args) > 1:
                inputs = [mx.symbol.var('data%d' % i) for i in range(len(args))]
            else:
                inputs = [mx.symbol.var('data')]
            grouped_inputs = _regroup(inputs, self._in_format)[0]
            params = {i: j.var() for i, j in self._reg_params.items()}
            with self.name_scope():
                out = self.hybrid_forward(mx.symbol, *grouped_inputs, **params)  # pylint: disable=no-value-for-parameter
            out, self._out_format = _flatten(out)
            assert len(out) == 1
            if self.use_memonger:
                assert len(inputs) == 1
                out = memonger.search_plan(out[0], data=(self.batch_size, 3, self.input_depth,
                                                         self.input_size, self.input_size))
                out = [out]
            self._cached_graph = inputs, out[0]  #mx.symbol.Group(out)
        return self._cached_graph

    def hybrid_forward(self, F, x):
        x = self.features(x)
        x = self.output(x)
        return x


def dsod_net(net_def, num_classes, growth_rate=64):
    growth_rate = growth_rate
    channels = 128
    net = nn.HybridSequential()
    with net.name_scope():
        ## dsod backbone
        net.add(conv_act_layer(64, kernel=3, pad=1, stride=2, act_type="relu", use_batchnorm=True))
        net.add(conv_act_layer(64, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))
        net.add(conv_act_layer(128, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))
        net.add(nn.MaxPool3D(pool_size=2, strides=2, padding=1))

        for i, (dense_layers, transition_fun) in enumerate(net_def):
            net.add(DsodBlock(layers=dense_layers, growth_rate=growth_rate))
            channels += growth_rate * dense_layers
            net.add(transition_fun(channels))

        classifier = nn.HybridSequential()
        classifier.add(nn.BatchNorm())
        classifier.add(nn.Activation('relu'))
        classifier.add(nn.GlobalAvgPool3D())
        classifier.add(nn.Flatten())
        classifier.add(nn.Dense(num_classes))
        net.add(classifier)
    return net


def dsod_net_v2(net_def, num_classes, growth_rate=64, use_memonger=False, **kwargs):
    net = DenseNet(net_def, num_classes, growth_rate, use_memonger, **kwargs)
    return net


def get_net(net_depth, num_classes, hybridize=True, growth_rate=64, **kwargs):
    densenet_spec = {30: [(6, transition), (8, transition), (8, transition_w_o_pooling), (8, transition_w_o_pooling)],
                     22: [(6, transition), (8, transition), (8, transition_w_o_pooling)]}
    net_def = densenet_spec[net_depth]
    #net = dsod_net(net_def, num_classes, growth_rate)
    net = dsod_net_v2(net_def, num_classes, growth_rate, False, **kwargs)
    if hybridize:
        net.hybridize()
    return net


if __name__ == '__main__':
    dsod = get_net(22, 101, True, 32)
    #print dsod
    dsod.initialize(ctx=mx.gpu(4))
    x = mx.nd.ones((32, 3, 16, 112, 112), ctx=mx.gpu(4))
    res = dsod(x)
    #print res.shape
    #print res
```
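As far as I understand, both mirror_stage and MXNET_BACKWARD_DO_MIRROR affect the activations kept for the backward pass, so the place where any saving should show up is a forward plus backward step rather than the forward-only run above. A minimal sketch of such a step (the sum() loss is just a placeholder, not my real training loss):

```python
from mxnet import autograd

dsod = get_net(22, 101, True, 32)
dsod.initialize(ctx=mx.gpu(4))
x = mx.nd.ones((32, 3, 16, 112, 112), ctx=mx.gpu(4))

with autograd.record():
    out = dsod(x)
    loss = out.sum()   # placeholder loss, just to drive the backward pass
loss.backward()
mx.nd.waitall()        # wait for all GPU work to finish before checking memory usage
```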