xinedison opened a new issue #10382: Does memonger work for gluon to save memory?
URL: https://github.com/apache/incubator-mxnet/issues/10382
 
 
   
   ## Description
   I want to reduce GPU memory usage when using Gluon. I tried MXNet memonger, but it did not work for me. After that I set os.environ['MXNET_BACKWARD_DO_MIRROR'] = '1', but that did not reduce memory either.
   
   ## Environment info (Required)
   
   ```
   ----------Python Info----------
   ('Version      :', '2.7.5')
   ('Compiler     :', 'GCC 4.8.5 20150623 (Red Hat 4.8.5-11)')
   ('Build        :', ('default', 'Nov  6 2016 00:28:07'))
   ('Arch         :', ('64bit', 'ELF'))
   ------------Pip Info-----------
   ('Version      :', '9.0.1')
   ('Directory    :', '/usr/lib/python2.7/site-packages/pip')
   ----------MXNet Info-----------
   ('Version      :', '1.2.0')
   ('Directory    :', '/home/yinghuang/incubator-mxnet-newest/python/mxnet')
   Hashtag not found. Not installed from pre-built package.
   ----------System Info----------
   ('Platform     :', 'Linux-3.10.0-327.22.2.el7.x86_64-x86_64-with-centos-7.2.1511-Core')
   ('system       :', 'Linux')
   ('node         :', 'gz-open-gpu-c117')
   ('release      :', '3.10.0-327.22.2.el7.x86_64')
   ('version      :', '#1 SMP Thu Jun 23 17:05:11 UTC 2016')
   ----------Hardware Info----------
   ('machine      :', 'x86_64')
   ('processor    :', 'x86_64')
   Architecture:          x86_64
   CPU op-mode(s):        32-bit, 64-bit
   Byte Order:            Little Endian
   CPU(s):                32
   On-line CPU(s) list:   0-31
   Thread(s) per core:    2
   Core(s) per socket:    8
   Socket(s):             2
   NUMA node(s):          2
   Vendor ID:             GenuineIntel
   CPU family:            6
   Model:                 62
   Model name:            Genuine Intel(R) CPU  @ 2.80GHz
   Stepping:              2
   CPU MHz:               1706.250
   BogoMIPS:              5617.25
   Virtualization:        VT-x
   L1d cache:             32K
   L1i cache:             32K
   L2 cache:              256K
   L3 cache:              25600K
   NUMA node0 CPU(s):     0-7,16-23
   NUMA node1 CPU(s):     8-15,24-31
   ----------Network Test----------
   Setting timeout: 10
   Timing for MXNet: https://github.com/apache/incubator-mxnet, DNS: 0.0234 sec, LOAD: 1.4669 sec.
   Timing for PYPI: https://pypi.python.org/pypi/pip, DNS: 0.0253 sec, LOAD: 0.4839 sec.
   Timing for FashionMNIST: https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-labels-idx1-ubyte.gz, DNS: 0.3580 sec, LOAD: 2.5292 sec.
   Timing for Conda: https://repo.continuum.io/pkgs/free/, DNS: 0.0229 sec, LOAD: 0.8054 sec.
   Timing for Gluon Tutorial(en): http://gluon.mxnet.io, DNS: 0.7395 sec, LOAD: 1.3768 sec.
   Timing for Gluon Tutorial(cn): https://zh.gluon.ai, DNS: 0.0230 sec, LOAD: 3.4856 sec.
   
   ```
   ## Question definition
   
   I am using Python 2 and the latest MXNet Gluon with 3D convolutions for video action recognition, and I want GPU memory to be used efficiently. 1) I tried [memonger](https://github.com/dmlc/mxnet-memonger.git) to optimize the symbol generated by the HybridBlock, but it had no effect despite considerable effort. 2) I then tried to save memory by only setting the environment variable MXNET_BACKWARD_DO_MIRROR to '1', but memory usage was not reduced either. Can someone show me what I have missed for memory saving?
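
   For reference, a minimal sketch of the environment-variable approach I tried (whether the variable must be exported in the shell before Python starts, or is picked up when set from the script, is part of what I am unsure about):

   ```python
   import os

   # MXNET_BACKWARD_DO_MIRROR=1 asks the backend to re-compute (mirror) some
   # forward activations during the backward pass instead of keeping them,
   # trading extra computation for lower memory use.
   os.environ['MXNET_BACKWARD_DO_MIRROR'] = '1'

   import mxnet as mx  # imported after setting the variable, to be safe

   # ... build, hybridize and run the network as in the reproduction code below ...
   ```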
   
   
   ## Steps to reproduce
   
   Here is the code of my network:

   ```python
   import sys
   import os
   
   
   mxnet_path = os.path.expanduser('~') + '/incubator-mxnet-newest'
   sys.path.insert(0, os.path.abspath(os.path.join(mxnet_path, "python")))
   import mxnet as mx
   from mxnet.gluon import nn
   from mxnet import nd
   from mxnet.gluon.block import _flatten,_regroup
   
   import memonger
   
   
   def bn_relu_conv(ks, nout, stride, pad,  name=None):
       layer = nn.HybridSequential()
       layer.add(nn.BatchNorm())
       layer.add(nn.Activation('relu'))
       layer.add(nn.Conv3D(channels=nout, kernel_size=ks, padding=pad, strides=stride))
       return layer
   
   def bn_relu_block(growth_rate):
       layer = nn.HybridSequential()
       layer.add(bn_relu_conv(1, nout=growth_rate, stride=1, pad=0))
       layer.add(bn_relu_conv(3, nout=growth_rate, stride=1, pad=1))
       return layer
   
   def conv_act_layer(channels, kernel=(1,1,1), pad=(0,0,0), stride=(1,1,1), act_type="relu", use_batchnorm=False):
       layer = nn.HybridSequential()
       layer.add(nn.Conv3D(channels=channels, kernel_size=kernel, padding=pad, strides=stride))
       if use_batchnorm:
           layer.add(nn.BatchNorm())
       layer.add(nn.Activation(act_type))
       return layer
   
   def transition(channels):
       transition_layer = nn.HybridSequential()
       transition_layer.add(bn_relu_conv(ks=1, nout=channels, stride=1, pad=0))
       transition_layer.add(nn.MaxPool3D(pool_size=2, strides=2))
       return transition_layer
   
   def transition_w_o_pooling(channels):
       layer = bn_relu_conv(ks=1, nout=channels, stride=1, pad=0)
       return layer
   
   
   class DsodBlock(nn.HybridBlock):
       def __init__(self, layers, growth_rate, use_memonger=False, **kwargs):
           super(DsodBlock, self).__init__(**kwargs)
           self.use_memonger = use_memonger
           self.net = nn.HybridSequential()
           for i in range(layers):
               lay = bn_relu_block(growth_rate)
               self.net.add(lay)
   
       def hybrid_forward(self, F, x):
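           # Dense connectivity: each sub-block's output is concatenated onto the
           # running feature map; mirror_stage marks nodes whose activations may be
           # re-computed in the backward pass instead of being kept in memory.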
           for idx, layer in enumerate(self.net):
               out = layer(x)
               x = F.concat(x, out, dim=1)
               if self.use_memonger and (idx % 2 == 0):
                   #print("use memonger true")
                   x._set_attr(mirror_stage='True')
   
           return x
   
   class DenseNet(nn.HybridBlock):
       def __init__(self, net_def, num_classes, growth_rate, use_memonger=False, batch_size=32, input_depth=16, input_size=112, **kwargs):
           super(DenseNet, self).__init__(**kwargs)
           channels = 128
           self.use_memonger = use_memonger
           self.batch_size = batch_size
           self.input_depth = input_depth
           self.input_size = input_size
   
           #assert self.use_memonger
           with self.name_scope():
               self.features = nn.HybridSequential(prefix='')
               self.features.add(conv_act_layer(64, kernel=3, pad=1, stride=2, act_type="relu", use_batchnorm=True))
               self.features.add(conv_act_layer(64, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))
               self.features.add(conv_act_layer(128, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))

               self.features.add(nn.MaxPool3D(pool_size=2, strides=2, padding=1))
   
               for i,(dense_layers,transition_fun) in enumerate(net_def):
                   self.features.add(DsodBlock(layers=dense_layers, growth_rate=growth_rate, use_memonger=use_memonger))
                   channels += growth_rate*dense_layers
                   self.features.add(transition_fun(channels))
   
               self.features.add(nn.BatchNorm())
               self.features.add(nn.Activation('relu'))
               self.features.add(nn.GlobalAvgPool3D())
               self.features.add(nn.Flatten())
   
               self.output = nn.Dense(num_classes, in_units=channels)
   
       def _get_graph(self, *args):
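           # Override of HybridBlock._get_graph that additionally runs
           # memonger.search_plan on the generated symbol when use_memonger is set.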
           #assert False
           if not self._cached_graph:
               args, self._in_format = _flatten(args)
               if len(args) > 1:
                   inputs = [mx.symbol.var('data%d'%i) for i in range(len(args))]
               else:
                   inputs = [mx.symbol.var('data')]
               grouped_inputs = _regroup(inputs, self._in_format)[0]
   
               params = {i: j.var() for i, j in self._reg_params.items()}
               with self.name_scope():
                   out = self.hybrid_forward(mx.symbol, *grouped_inputs, **params)  # pylint: disable=no-value-for-parameter
               out, self._out_format = _flatten(out)
   
               assert len(out) == 1
               if self.use_memonger:
                   assert len(inputs) == 1
                   out = memonger.search_plan(out[0], data=(self.batch_size, 3, self.input_depth, self.input_size, self.input_size))
                   out = [out]
   
               self._cached_graph = inputs, out[0] #mx.symbol.Group(out)
   
           return self._cached_graph
   
   
       def hybrid_forward(self, F, x):
           x = self.features(x)
           x = self.output(x)
           return x
   
   
   def dsod_net(net_def, num_classes, growth_rate=64):
       growth_rate = growth_rate
       channels = 128
       net = nn.HybridSequential()
       with net.name_scope():
           ## dsod backbone
           net.add(conv_act_layer(64, kernel=3, pad=1, stride=2, act_type="relu", use_batchnorm=True))
           net.add(conv_act_layer(64, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))
           net.add(conv_act_layer(128, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))
   
           net.add(nn.MaxPool3D(pool_size=2, strides=2, padding=1))
   
           for i,(dense_layers,transition_fun) in enumerate(net_def):
               net.add(DsodBlock(layers=dense_layers, growth_rate=growth_rate))
               channels += growth_rate*dense_layers
               net.add(transition_fun(channels))
   
           classifier = nn.HybridSequential()
           classifier.add(nn.BatchNorm())
           classifier.add(nn.Activation('relu'))
           classifier.add(nn.GlobalAvgPool3D())
           classifier.add(nn.Flatten())
           classifier.add(nn.Dense(num_classes))
   
           net.add(classifier)
   
       return net
   
   def dsod_net_v2(net_def, num_classes, growth_rate=64, use_memonger=False, **kwargs):
       net = DenseNet(net_def, num_classes, growth_rate, use_memonger, **kwargs)
       return net
   
   def get_net(net_depth, num_classes, hybridize=True, growth_rate=64, **kwargs):
       densenet_spec = {30:[(6,transition), (8,transition), (8,transition_w_o_pooling), (8,transition_w_o_pooling)],
                        22:[(6,transition), (8,transition), (8,transition_w_o_pooling)]}
       net_def =  densenet_spec[net_depth]
       #net = dsod_net(net_def, num_classes, growth_rate)
       net = dsod_net_v2(net_def, num_classes, growth_rate, False, **kwargs)
   
       if hybridize:
           net.hybridize()
       return net
   
   
   if __name__ == '__main__':
       dsod = get_net(22, 101, True, 32)
       #print dsod
       dsod.initialize(ctx=mx.gpu(4))
       x = mx.nd.ones((32,3,16,112,112), ctx=mx.gpu(4))
       res = dsod(x)
       #print res.shape
       #print res

   ```

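   For reference, the plain-symbol usage I started from follows the pattern in the mxnet-memonger README; a minimal sketch (the toy network and shapes are only illustrative, not my actual model):

   ```python
   import mxnet as mx
   import memonger  # from https://github.com/dmlc/mxnet-memonger

   # Toy symbolic network; the real model is the 3D DenseNet above.
   data = mx.sym.Variable('data')
   net = mx.sym.FullyConnected(data, num_hidden=128, name='fc1')
   net = mx.sym.Activation(net, act_type='relu', name='relu1')
   net._set_attr(mirror_stage='True')  # mark an activation that may be re-computed
   net = mx.sym.FullyConnected(net, num_hidden=10, name='fc2')

   # Search for a memory-efficient execution plan for the given input shape,
   # then bind the planned symbol as usual.
   net_planned = memonger.search_plan(net, data=(32, 100))
   mod = mx.mod.Module(net_planned, context=mx.gpu(0))
   ```

   My question is essentially whether this kind of plan searching (or the mirror environment variable) can take effect on the symbol that a hybridized Gluon block caches internally.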