kohillyang commented on issue #18902:
URL: https://github.com/apache/incubator-mxnet/issues/18902#issuecomment-671957553
@szha The following code reproduces the above error.
```python
from __future__ import print_function
import mxnet as mx
import mxnet.autograd as ag
import numpy as np
import gluoncv
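# (assumes an MXNet build that ships mxnet.contrib.amp, e.g. 1.6+, plus gluoncv and one GPU)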


class FCOS_Head(mx.gluon.nn.HybridBlock):
    def __init__(self, num_classes):
        super(FCOS_Head, self).__init__()
        with self.name_scope():
            self.feat_cls = mx.gluon.nn.HybridSequential()
            init = mx.init.Normal(sigma=0.01)
            init.set_verbosity(True)
            # focal-loss style prior: bias = -log((1 - pi) / pi) with pi = 0.01,
            # so the initial classification probability is ~0.01
            init_bias = mx.init.Constant(-1 * np.log((1 - 0.01) / 0.01))
            init_bias.set_verbosity(True)
            for i in range(4):
                self.feat_cls.add(mx.gluon.nn.Conv2D(channels=256, kernel_size=3,
                                                     padding=1, weight_initializer=init))
                self.feat_cls.add(mx.gluon.nn.GroupNorm(num_groups=32))
                self.feat_cls.add(mx.gluon.nn.Activation(activation="relu"))
            self.feat_cls.add(mx.gluon.nn.Conv2D(channels=num_classes - 1,
                                                 kernel_size=1, padding=0,
                                                 bias_initializer=init_bias,
                                                 weight_initializer=init))
            self.feat_reg = mx.gluon.nn.HybridSequential()
            for i in range(4):
                self.feat_reg.add(mx.gluon.nn.Conv2D(channels=256, kernel_size=3,
                                                     padding=1, weight_initializer=init))
                self.feat_reg.add(mx.gluon.nn.GroupNorm(num_groups=32))
                self.feat_reg.add(mx.gluon.nn.Activation(activation="relu"))
            # one extra channel for center-ness, four channels for location regression
            self.feat_reg_loc = mx.gluon.nn.Conv2D(channels=4, kernel_size=1,
                                                   padding=0, weight_initializer=init)
            self.feat_reg_centerness = mx.gluon.nn.Conv2D(channels=1, kernel_size=1,
                                                          padding=0, weight_initializer=init)

    def hybrid_forward(self, F, x):
        feat_reg = self.feat_reg(x)
        x_loc = self.feat_reg_loc(feat_reg)
        x_centerness = self.feat_reg_centerness(feat_reg)
        x_cls = self.feat_cls(x)
        # per-level output layout: [4 location | 1 center-ness | num_classes - 1 class scores]
        x = F.concat(x_loc, x_centerness, x_cls, dim=1)
        return x
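

# Backbone wrapper: a ResNet-50 v1b trunk with fixed (non-learnable) ImageNet
# mean/std parameters, so input normalization happens inside the hybridized graph.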
class resnet(mx.gluon.nn.HybridBlock):
    def __init__(self):
        super(resnet, self).__init__()
        self.feat = gluoncv.model_zoo.resnet50_v1b(pretrained=False)
        self.mean = self.params.get('mean', shape=[1, 3, 1, 1],
                                    init=mx.init.Zero(),
                                    allow_deferred_init=False,
                                    grad_req='null')
        self.std = self.params.get('std', shape=[1, 3, 1, 1],
                                   init=mx.init.One(),
                                   allow_deferred_init=False,
                                   grad_req='null')
        self.mean._load_init(mx.nd.array([[[[0.485]], [[0.456]], [[0.406]]]]), ctx=mx.cpu())
        self.std._load_init(mx.nd.array([[[[0.229]], [[0.224]], [[0.225]]]]), ctx=mx.cpu())

    def hybrid_forward(self, F, x, mean, std):
        # NHWC input in [0, 255] -> NCHW, scaled to [0, 1], then normalized
        x = F.transpose(x, (0, 3, 1, 2))
        x = x / 255.0
        x = F.broadcast_sub(x, mean)
        x = F.broadcast_div(x, std)
        x = self.feat.conv1(x)
        x = self.feat.bn1(x)
        x = self.feat.relu(x)
        x = self.feat.maxpool(x)
        res2 = self.feat.layer1(x)
        res3 = self.feat.layer2(res2)
        res4 = self.feat.layer3(res3)
        res5 = self.feat.layer4(res4)
        return res5


class FCOSFPNNet(mx.gluon.nn.HybridBlock):
    def __init__(self, num_classes):
        super(FCOSFPNNet, self).__init__()
        self.backbone = resnet()
        self.fcos_head = FCOS_Head(num_classes)

    def hybrid_forward(self, F, x):
        # typically the strides are (4, 8, 16, 32, 64)
        x = self.backbone(x)
        if isinstance(x, (list, tuple)):
            return [self.fcos_head(xx) for xx in x]
        else:
            return [self.fcos_head(x)]
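

# Training driver: build the net, initialize parameters by hand, enable AMP,
# then run one forward/backward/step on dummy NHWC input to reproduce the error.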
def train_net():
    mx.random.seed(3)
    np.random.seed(3)
    ctx_list = [mx.gpu(0)]
    net = FCOSFPNNet(11)

    # Initialize parameters: keep a parameter's own initializer when it has one,
    # otherwise fall back to Zero for biases/offsets and Normal for weights.
    params = net.collect_params()
    for key in params.keys():
        if params[key]._data is None:
            default_init = mx.init.Zero() if "bias" in key or "offset" in key else mx.init.Normal()
            default_init.set_verbosity(True)
            if params[key].init is not None and hasattr(params[key].init, "set_verbosity"):
                params[key].init.set_verbosity(True)
                params[key].initialize(init=params[key].init, default_init=params[key].init)
            else:
                params[key].initialize(default_init=default_init)
    net.collect_params().reset_ctx(list(set(ctx_list)))

    if True:
        from mxnet.contrib import amp
        amp.init()
        # cast the network to float16 but keep batch-norm parameters in float32
        net.cast("float16")
        net.collect_params('.*batchnorm.*').setattr('dtype', 'float32')
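
    # update_on_kvstore=False below follows the MXNet AMP tutorial's trainer setup;
    # (assumption) amp.init_trainer() expects parameter updates to happen outside the kvstore.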
    trainer = mx.gluon.Trainer(
        net.collect_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {'wd': 1e-4,
         'momentum': .9,
         'clip_gradient': None,
         'lr_scheduler': None,
         'multi_precision': True,
         },
        update_on_kvstore=(False if True else None),
        kvstore=mx.kvstore.create('local'))

    if True:
        amp.init_trainer(trainer)

    with ag.record():
        data = mx.nd.zeros(shape=(1, 368, 368, 3), ctx=ctx_list[0])
        fpn_predictions = net(data)
        preds = mx.nd.concat(*[x.reshape((0, 0, -1)) for x in fpn_predictions], dim=2)
        # scale the loss before backward; AMP rescales the gradients on trainer.step()
        with amp.scale_loss(preds.sum(), trainer) as scaled_losses:
            ag.backward(scaled_losses)
    trainer.step(1, ignore_stale_grad=True)


if __name__ == '__main__':
    train_net()
```