This is an automated email from the ASF dual-hosted git repository.

haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 6aa8c27  [MXNET-1327] Allow RNN Layers to be initialized to fp16 (#14219)
6aa8c27 is described below

commit 6aa8c27a69cafc65eb97c4e077cb399462b617e2
Author: Thomas Delteil <[email protected]>
AuthorDate: Tue Mar 12 14:30:19 2019 -0700

    [MXNET-1327] Allow RNN Layers to be initialized to fp16 (#14219)
    
    * update rnn for fp16
    
    * fix typo in test
    
    * fix tests
    
    * fix tests
    
    * fix gpu tests
    
    * Update test_gluon_rnn.py
    
    * Update test_gluon_rnn.py
    
    * trigger
    
    * try removing checks for unix
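    
    A minimal usage sketch of the new dtype keyword (illustrative only, not
    part of this change; the layer sizes and shapes mirror the updated tests,
    and the fp16 path assumes a CUDA GPU, since RNN fp16 is GPU-only for now):
    
        import mxnet as mx
        from mxnet import gluon
    
        # Parameters and default states are created directly in float16.
        layer = gluon.rnn.LSTM(10, num_layers=2, dtype='float16')
        layer.collect_params().initialize(ctx=mx.gpu())  # fp16 RNN needs a GPU context
    
        x = mx.nd.ones((8, 3, 20), dtype='float16', ctx=mx.gpu())  # layout is TNC by default
        out = layer(x)
    
        # An existing float32 layer can also be converted afterwards; the
        # overridden cast() keeps the default states in sync with the new dtype.
        layer32 = gluon.rnn.LSTM(10, num_layers=2)
        layer32.cast('float16')
    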
---
 python/mxnet/gluon/rnn/rnn_layer.py     | 59 ++++++++++++--------
 tests/python/unittest/test_gluon_rnn.py | 98 ++++++++++++++++++++++-----------
 2 files changed, 101 insertions(+), 56 deletions(-)

diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py
index c43dc85..6dfec43 100644
--- a/python/mxnet/gluon/rnn/rnn_layer.py
+++ b/python/mxnet/gluon/rnn/rnn_layer.py
@@ -37,7 +37,7 @@ class _RNNLayer(HybridBlock):
                  i2h_bias_initializer, h2h_bias_initializer,
                  mode, projection_size, h2r_weight_initializer,
                  lstm_state_clip_min, lstm_state_clip_max, lstm_state_clip_nan,
-                 **kwargs):
+                 dtype, **kwargs):
         super(_RNNLayer, self).__init__(**kwargs)
         assert layout in ('TNC', 'NTC'), \
             "Invalid layout %s; must be one of ['TNC' or 'NTC']"%layout
@@ -57,6 +57,7 @@ class _RNNLayer(HybridBlock):
         self._lstm_state_clip_min = lstm_state_clip_min
         self._lstm_state_clip_max = lstm_state_clip_max
         self._lstm_state_clip_nan = lstm_state_clip_nan
+        self._dtype = dtype
 
         self._gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]
 
@@ -66,16 +67,16 @@ class _RNNLayer(HybridBlock):
                 for j in ['l', 'r'][:self._dir]:
                     self._register_param('{}{}_i2h_weight'.format(j, i),
                                          shape=(ng*nh, ni),
-                                         init=i2h_weight_initializer)
+                                         init=i2h_weight_initializer, dtype=dtype)
                     self._register_param('{}{}_h2h_weight'.format(j, i),
                                          shape=(ng*nh, nh),
-                                         init=h2h_weight_initializer)
+                                         init=h2h_weight_initializer, dtype=dtype)
                     self._register_param('{}{}_i2h_bias'.format(j, i),
                                          shape=(ng*nh,),
-                                         init=i2h_bias_initializer)
+                                         init=i2h_bias_initializer, dtype=dtype)
                     self._register_param('{}{}_h2h_bias'.format(j, i),
                                          shape=(ng*nh,),
-                                         init=h2h_bias_initializer)
+                                         init=h2h_bias_initializer, dtype=dtype)
                 ni = nh * self._dir
         else:
             np = self._projection_size
@@ -83,24 +84,24 @@ class _RNNLayer(HybridBlock):
                 for j in ['l', 'r'][:self._dir]:
                     self._register_param('{}{}_i2h_weight'.format(j, i),
                                          shape=(ng*nh, ni),
-                                         init=i2h_weight_initializer)
+                                         init=i2h_weight_initializer, dtype=dtype)
                     self._register_param('{}{}_h2h_weight'.format(j, i),
                                          shape=(ng*nh, np),
-                                         init=h2h_weight_initializer)
+                                         init=h2h_weight_initializer, dtype=dtype)
                     self._register_param('{}{}_i2h_bias'.format(j, i),
                                          shape=(ng*nh,),
-                                         init=i2h_bias_initializer)
+                                         init=i2h_bias_initializer, dtype=dtype)
                     self._register_param('{}{}_h2h_bias'.format(j, i),
                                          shape=(ng*nh,),
-                                         init=h2h_bias_initializer)
+                                         init=h2h_bias_initializer, dtype=dtype)
                     self._register_param('{}{}_h2r_weight'.format(j, i),
                                          shape=(np, nh),
-                                         init=h2r_weight_initializer)
+                                         init=h2r_weight_initializer, dtype=dtype)
                 ni = np * self._dir
 
-    def _register_param(self, name, shape, init):
+    def _register_param(self, name, shape, init, dtype):
         p = self.params.get(name, shape=shape, init=init,
-                            allow_deferred_init=True)
+                            allow_deferred_init=True, dtype=dtype)
         setattr(self, name, p)
         return p
 
@@ -179,6 +180,10 @@ class _RNNLayer(HybridBlock):
 
         return stack
 
+    def cast(self, dtype):
+        super(_RNNLayer, self).cast(dtype)
+        self._dtype = dtype
+
     def begin_state(self, batch_size=0, func=ndarray.zeros, **kwargs):
         """Initial state for this cell.
 
@@ -317,6 +322,8 @@ class RNN(_RNNLayer):
     input_size: int, default 0
         The number of expected features in the input x.
         If not specified, it will be inferred from input.
+    dtype : str, default 'float32'
+        Type to initialize the parameters and default states to
     prefix : str or None
         Prefix of this `Block`.
     params : ParameterDict or None
@@ -357,17 +364,17 @@ class RNN(_RNNLayer):
                  layout='TNC', dropout=0, bidirectional=False,
                  i2h_weight_initializer=None, h2h_weight_initializer=None,
                  i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
-                 input_size=0, **kwargs):
+                 input_size=0, dtype='float32', **kwargs):
         super(RNN, self).__init__(hidden_size, num_layers, layout,
                                   dropout, bidirectional, input_size,
                                   i2h_weight_initializer, h2h_weight_initializer,
                                   i2h_bias_initializer, h2h_bias_initializer,
                                   'rnn_'+activation, None, None, None, None, False,
-                                  **kwargs)
+                                  dtype, **kwargs)
 
     def state_info(self, batch_size=0):
         return [{'shape': (self._num_layers * self._dir, batch_size, self._hidden_size),
-                 '__layout__': 'LNC'}]
+                 '__layout__': 'LNC', 'dtype': self._dtype}]
 
 
 class LSTM(_RNNLayer):
@@ -432,6 +439,8 @@ class LSTM(_RNNLayer):
     state_clip_nan : boolean, default False
         Whether to stop NaN from propagating in state by clipping it to min/max.
         If the clipping range is not specified, this option is ignored.
+    dtype : str, default 'float32'
+        Type to initialize the parameters and default states to
     input_size: int, default 0
         The number of expected features in the input x.
         If not specified, it will be inferred from input.
@@ -477,26 +486,26 @@ class LSTM(_RNNLayer):
                  i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
                  projection_size=None, h2r_weight_initializer=None,
                  state_clip_min=None, state_clip_max=None, state_clip_nan=False,
-                 **kwargs):
+                 dtype='float32', **kwargs):
         super(LSTM, self).__init__(hidden_size, num_layers, layout,
                                    dropout, bidirectional, input_size,
                                    i2h_weight_initializer, h2h_weight_initializer,
                                    i2h_bias_initializer, h2h_bias_initializer,
                                    'lstm', projection_size, h2r_weight_initializer,
                                    state_clip_min, state_clip_max, state_clip_nan,
-                                   **kwargs)
+                                   dtype, **kwargs)
 
     def state_info(self, batch_size=0):
         if self._projection_size is None:
             return [{'shape': (self._num_layers * self._dir, batch_size, self._hidden_size),
-                     '__layout__': 'LNC'},
+                     '__layout__': 'LNC', 'dtype': self._dtype},
                     {'shape': (self._num_layers * self._dir, batch_size, self._hidden_size),
-                     '__layout__': 'LNC'}]
+                     '__layout__': 'LNC', 'dtype': self._dtype}]
         else:
             return [{'shape': (self._num_layers * self._dir, batch_size, self._projection_size),
-                     '__layout__': 'LNC'},
+                     '__layout__': 'LNC', 'dtype': self._dtype},
                     {'shape': (self._num_layers * self._dir, batch_size, self._hidden_size),
-                     '__layout__': 'LNC'}]
+                     '__layout__': 'LNC', 'dtype': self._dtype}]
 
 
 class GRU(_RNNLayer):
@@ -544,6 +553,8 @@ class GRU(_RNNLayer):
         Initializer for the bias vector.
     h2h_bias_initializer : str or Initializer
         Initializer for the bias vector.
+    dtype : str, default 'float32'
+        Type to initialize the parameters and default states to
     input_size: int, default 0
         The number of expected features in the input x.
         If not specified, it will be inferred from input.
@@ -586,14 +597,14 @@ class GRU(_RNNLayer):
                  dropout=0, bidirectional=False, input_size=0,
                  i2h_weight_initializer=None, h2h_weight_initializer=None,
                  i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
-                 **kwargs):
+                 dtype='float32', **kwargs):
         super(GRU, self).__init__(hidden_size, num_layers, layout,
                                   dropout, bidirectional, input_size,
                                   i2h_weight_initializer, h2h_weight_initializer,
                                   i2h_bias_initializer, h2h_bias_initializer,
                                   'gru', None, None, None, None, False,
-                                  **kwargs)
+                                  dtype, **kwargs)
 
     def state_info(self, batch_size=0):
         return [{'shape': (self._num_layers * self._dir, batch_size, self._hidden_size),
-                 '__layout__': 'LNC'}]
+                 '__layout__': 'LNC', 'dtype': self._dtype}]
diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py
index edc43d2..b410362 100644
--- a/tests/python/unittest/test_gluon_rnn.py
+++ b/tests/python/unittest/test_gluon_rnn.py
@@ -427,9 +427,15 @@ def test_rnn_cells_export_import():
         assert_almost_equal(output1.asnumpy(), output2.asnumpy())
 
 
-def check_rnn_layer_forward(layer, inputs, states=None, run_only=False):
-    layer.collect_params().initialize()
+def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, ctx=mx.cpu()):
+    layer.collect_params().initialize(ctx=ctx)
+    inputs = inputs.as_in_context(ctx)
     inputs.attach_grad()
+    if states is not None:
+        if isinstance(states, (list, tuple)):
+            states = [s.as_in_context(ctx) for s in states]
+        else:
+            states = states.as_in_context(ctx)
     with mx.autograd.record():
         if states is None:
             out = layer(inputs)
@@ -467,47 +473,76 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False):
         mx.test_utils.assert_almost_equal(np_dx, inputs.grad.asnumpy(), rtol=1e-3, atol=1e-5)
 
 
-@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
-def test_rnn_layers():
-    check_rnn_layer_forward(gluon.rnn.RNN(10, 2), mx.nd.ones((8, 3, 20)))
-    check_rnn_layer_forward(gluon.rnn.RNN(10, 2, bidirectional=True), mx.nd.ones((8, 3, 20)), mx.nd.ones((4, 3, 10)))
-    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2), mx.nd.ones((8, 3, 20)))
-    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True), mx.nd.ones((8, 3, 20)), [mx.nd.ones((4, 3, 10)), mx.nd.ones((4, 3, 10))])
-    check_rnn_layer_forward(gluon.rnn.GRU(10, 2), mx.nd.ones((8, 3, 20)))
-    check_rnn_layer_forward(gluon.rnn.GRU(10, 2, bidirectional=True), mx.nd.ones((8, 3, 20)), mx.nd.ones((4, 3, 10)))
-
-    check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dropout=0.5), mx.nd.ones((8, 3, 20)),
-                            run_only=True)
-    check_rnn_layer_forward(gluon.rnn.RNN(10, 2, bidirectional=True, dropout=0.5),
-                            mx.nd.ones((8, 3, 20)), mx.nd.ones((4, 3, 10)), run_only=True)
-    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5), mx.nd.ones((8, 3, 20)),
-                            run_only=True)
-    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True, dropout=0.5),
-                            mx.nd.ones((8, 3, 20)),
-                            [mx.nd.ones((4, 3, 10)), mx.nd.ones((4, 3, 10))], run_only=True)
-    check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dropout=0.5), mx.nd.ones((8, 3, 20)),
-                            run_only=True)
-    check_rnn_layer_forward(gluon.rnn.GRU(10, 2, bidirectional=True, dropout=0.5),
-                            mx.nd.ones((8, 3, 20)), mx.nd.ones((4, 3, 10)), run_only=True)
+
+def run_rnn_layers(dtype, dtype2, ctx=mx.cpu()):
+
+    check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.nd.ones((8, 3, 20), dtype=dtype), ctx=ctx)
+    check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.nd.ones((8, 3, 20),  dtype=dtype), mx.nd.ones((4, 3, 10),  dtype=dtype), ctx=ctx)
+    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2,dtype=dtype), mx.nd.ones((8, 3, 20),  dtype=dtype), ctx=ctx)
+    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2,dtype=dtype,  bidirectional=True), mx.nd.ones((8, 3, 20),  dtype=dtype), [mx.nd.ones((4, 3, 10),  dtype=dtype), mx.nd.ones((4, 3, 10),  dtype=dtype)],ctx=ctx)
+    check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dtype=dtype, ), mx.nd.ones((8, 3, 20), dtype=dtype),ctx=ctx)
+    check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dtype=dtype, bidirectional=True), mx.nd.ones((8, 3, 20),  dtype=dtype), mx.nd.ones((4, 3, 10),  dtype=dtype),ctx=ctx)
+
+
+    check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, dropout=0.5), mx.nd.ones((8, 3, 20), dtype=dtype),
+                            run_only=True, ctx=ctx)
+    check_rnn_layer_forward(gluon.rnn.RNN(10, 2, bidirectional=True, dropout=0.5, dtype=dtype),
+                            mx.nd.ones((8, 3, 20), dtype=dtype), mx.nd.ones((4, 3, 10), dtype=dtype), run_only=True, ctx=ctx)
+    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5, dtype=dtype), mx.nd.ones((8, 3, 20), dtype=dtype),
+                            run_only=True, ctx=ctx)
+    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True, dropout=0.5, dtype=dtype),
+                            mx.nd.ones((8, 3, 20), dtype=dtype),
+                            [mx.nd.ones((4, 3, 10), dtype=dtype), mx.nd.ones((4, 3, 10), dtype=dtype)], run_only=True, ctx=ctx)
+    check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dropout=0.5, dtype=dtype), mx.nd.ones((8, 3, 20), dtype=dtype),
+                            run_only=True, ctx=ctx)
+    check_rnn_layer_forward(gluon.rnn.GRU(10, 2, bidirectional=True, dropout=0.5, dtype=dtype),
+                            mx.nd.ones((8, 3, 20), dtype=dtype), mx.nd.ones((4, 3, 10), dtype=dtype), run_only=True, ctx=ctx)
 
     net = gluon.nn.Sequential()
-    net.add(gluon.rnn.LSTM(10, bidirectional=True))
+    net.add(gluon.rnn.LSTM(10, bidirectional=True, dtype=dtype2))
     net.add(gluon.nn.BatchNorm(axis=2))
     net.add(gluon.nn.Flatten())
     net.add(gluon.nn.Dense(3, activation='relu'))
-    net.collect_params().initialize()
+    net.collect_params().initialize(ctx=ctx)
+    net.cast(dtype)
     with mx.autograd.record():
-        net(mx.nd.ones((2, 3, 10))).backward()
+        out = net(mx.nd.ones((2, 3, 10), dtype=dtype, ctx=ctx))
+        out.backward()
+        out = out.asnumpy()
 
     net2 = gluon.nn.HybridSequential()
-    net2.add(gluon.rnn.LSTM(10, bidirectional=True))
+    net2.add(gluon.rnn.LSTM(10, bidirectional=True, dtype=dtype2))
     net2.add(gluon.nn.BatchNorm(axis=2))
     net2.add(gluon.nn.Flatten())
     net2.add(gluon.nn.Dense(3, activation='relu'))
     net2.hybridize()
-    net2.collect_params().initialize()
+    net2.collect_params().initialize(ctx=ctx)
+    net2.cast(dtype)
+    with mx.autograd.record():
+        out = net2(mx.nd.ones((2, 3, 10), dtype=dtype, ctx=ctx))
+        out.backward()
+        out = out.asnumpy()
+
+    net3 = gluon.nn.HybridSequential()
+    net3.add(gluon.rnn.LSTM(10, bidirectional=True, dtype=dtype))
+    net3.add(gluon.nn.BatchNorm(axis=2))
+    net3.add(gluon.nn.Flatten())
+    net3.add(gluon.nn.Dense(3, activation='relu'))
+    net3.hybridize()
+    net3.collect_params().initialize(ctx=ctx)
+    net3.cast(dtype2)
     with mx.autograd.record():
-        net2(mx.nd.ones((2, 3, 10))).backward()
+        out = net3(mx.nd.ones((2, 3, 10), dtype=dtype2, ctx=ctx))
+        out.backward()
+        out = out.asnumpy()
+
+def test_rnn_layers_fp32():
+    run_rnn_layers('float32', 'float32')
+
+@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
+@unittest.skipIf(mx.context.num_gpus() == 0, "RNN FP16 only implemented for GPU for now")
+def test_rnn_layers_fp16():
+    run_rnn_layers('float16', 'float32', mx.gpu())
 
 
 def test_rnn_unroll_variant_length():
@@ -590,8 +625,6 @@ def test_cell_fill_shape():
     check_rnn_forward(cell, mx.nd.ones((2, 3, 7)))
     assert cell.i2h_weight.shape[1] == 7, cell.i2h_weight.shape[1]
 
-
-@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
 def test_layer_fill_shape():
     layer = gluon.rnn.LSTM(10)
     layer.hybridize()
@@ -603,6 +636,7 @@ def test_layer_fill_shape():
 def test_bidirectional_unroll_valid_length():
     # Test BidirectionalCell.
     # In 1.3.1 version, after hybridize( ), BidirectionalCell would failed when pass valid_length to unroll( ).
+    
     class BiLSTM(gluon.nn.HybridBlock):
         def __init__(self, rnn_size, time_step, **kwargs):
             super(BiLSTM, self).__init__(**kwargs)

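For reference, the end-to-end pattern the new tests exercise (an LSTM feeding
BatchNorm/Flatten/Dense under autograd, with the whole block cast to a single
dtype) looks roughly like the sketch below. This is illustrative only: it
assumes a CUDA build when float16 is selected and falls back to float32 on CPU.

    import mxnet as mx
    from mxnet import gluon

    # Pick a context/dtype pair the fp16 RNN path supports (GPU only for now).
    ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()
    dtype = 'float16' if ctx.device_type == 'gpu' else 'float32'

    net = gluon.nn.HybridSequential()
    net.add(gluon.rnn.LSTM(10, bidirectional=True, dtype=dtype))
    net.add(gluon.nn.BatchNorm(axis=2))
    net.add(gluon.nn.Flatten())
    net.add(gluon.nn.Dense(3, activation='relu'))
    net.hybridize()
    net.collect_params().initialize(ctx=ctx)
    net.cast(dtype)  # cast the whole block so BatchNorm/Dense match the LSTM dtype

    with mx.autograd.record():
        out = net(mx.nd.ones((2, 3, 10), dtype=dtype, ctx=ctx))
        out.backward()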