
Commit

[v2.0] RNN: use rnn_params (#20384)
* use rnn_params

* add split rnn parameter in gluon.utils

* update

* update

* use zero weight

* add rnn fused parameter initializer

* fix lint

* fix tests

* update RNNFused initializer

* fix

* fix

* fix leak

* fix

* fix

* fix

* update

* update centos cu102 to use cudnn8

* fix

* fix conflict
barry-jin authored Oct 20, 2021
1 parent 481eba7 commit 5f0efbb
Showing 8 changed files with 339 additions and 122 deletions.
2 changes: 1 addition & 1 deletion ci/docker/docker-compose.yml
@@ -58,7 +58,7 @@ services:
dockerfile: Dockerfile.build.centos7
target: base
args:
BASE_IMAGE: nvidia/cuda:10.2-cudnn7-devel-centos7
BASE_IMAGE: nvidia/cuda:10.2-cudnn8-devel-centos7
cache_from:
- ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu102:latest
centos7_gpu_cu110:
3 changes: 3 additions & 0 deletions python/mxnet/gluon/parameter.py
@@ -472,6 +472,9 @@ def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(),
ctx = [context.current_context()]
if isinstance(ctx, Context):
ctx = [ctx]
if isinstance(self.init, initializer.RNNFused):
self.init.set_initializer(init if init else default_init)
init = default_init = self.init
if init is None:
init = default_init if self.init is None else self.init
if not shape_is_known(self.shape):
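
The three added lines above change how a user-supplied initializer interacts with the fused RNN parameter: when a Parameter's own init is an RNNFused initializer, the initializer passed to initialize() (or the default) is handed to RNNFused.set_initializer and used to fill the sub-initializer slots that were left unset, instead of overwriting the fused initializer. A minimal sketch of the resulting end-to-end behavior, assuming the MXNet 2.0 Gluon API; the layer sizes and the Normal(0.02) initializer are illustrative:

import mxnet as mx
from mxnet.gluon import rnn

layer = rnn.LSTM(hidden_size=20, num_layers=2)   # single fused parameter, shape deferred
layer.initialize(mx.init.Normal(0.02))           # routed into RNNFused's unset weight slots;
                                                 # the layer's default 'zeros' bias initializers stay zero
x = mx.np.random.uniform(size=(5, 3, 10))        # (sequence, batch, input) in the default TNC layout
out, states = layer(x, layer.begin_state(batch_size=3))
print(layer.rnn_param.shape)                     # one fused 1-D parameter: (5920,) for these sizes
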
105 changes: 29 additions & 76 deletions python/mxnet/gluon/rnn/rnn_layer.py
@@ -23,7 +23,7 @@

__all__ = ['RNN', 'LSTM', 'GRU']

from ... import np, npx, context
from ... import np, npx, context, initializer
from .. import HybridBlock, tensor_types
from ..parameter import Parameter
from ...util import use_np
@@ -50,11 +50,6 @@ def __init__(self, hidden_size, num_layers, layout,
self._dropout = dropout
self._dir = 2 if bidirectional else 1
self._input_size = input_size
self._i2h_weight_initializer = i2h_weight_initializer
self._h2h_weight_initializer = h2h_weight_initializer
self._i2h_bias_initializer = i2h_bias_initializer
self._h2h_bias_initializer = h2h_bias_initializer
self._h2r_weight_initializer = h2r_weight_initializer
self._lstm_state_clip_min = lstm_state_clip_min
self._lstm_state_clip_max = lstm_state_clip_max
self._lstm_state_clip_nan = lstm_state_clip_nan
@@ -64,48 +59,17 @@ def __init__(self, hidden_size, num_layers, layout,

self._gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]

ng, ni, nh = self._gates, input_size, hidden_size
if not projection_size:
for i in range(num_layers):
for j in ['l', 'r'][:self._dir]:
self._register_param('{}{}_i2h_weight'.format(j, i),
shape=(ng*nh, ni),
init=i2h_weight_initializer, dtype=dtype)
self._register_param('{}{}_h2h_weight'.format(j, i),
shape=(ng*nh, nh),
init=h2h_weight_initializer, dtype=dtype)
self._register_param('{}{}_i2h_bias'.format(j, i),
shape=(ng*nh,),
init=i2h_bias_initializer, dtype=dtype)
self._register_param('{}{}_h2h_bias'.format(j, i),
shape=(ng*nh,),
init=h2h_bias_initializer, dtype=dtype)
ni = nh * self._dir
else:
ps = self._projection_size
for i in range(num_layers):
for j in ['l', 'r'][:self._dir]:
self._register_param('{}{}_i2h_weight'.format(j, i),
shape=(ng*nh, ni),
init=i2h_weight_initializer, dtype=dtype)
self._register_param('{}{}_h2h_weight'.format(j, i),
shape=(ng*nh, ps),
init=h2h_weight_initializer, dtype=dtype)
self._register_param('{}{}_i2h_bias'.format(j, i),
shape=(ng*nh,),
init=i2h_bias_initializer, dtype=dtype)
self._register_param('{}{}_h2h_bias'.format(j, i),
shape=(ng*nh,),
init=h2h_bias_initializer, dtype=dtype)
self._register_param('{}{}_h2r_weight'.format(j, i),
shape=(ps, nh),
init=h2r_weight_initializer, dtype=dtype)
ni = ps * self._dir

def _register_param(self, name, shape, init, dtype):
p = Parameter(name, shape=shape, init=init, allow_deferred_init=True, dtype=dtype)
setattr(self, name, p)
return p
param_initializer = initializer.RNNFused(
mode, num_layers, hidden_size,
bidirectional, projection_size,
i2h_weight_initializer=i2h_weight_initializer,
h2h_weight_initializer=h2h_weight_initializer,
i2h_bias_initializer=i2h_bias_initializer,
h2h_bias_initializer=h2h_bias_initializer,
h2r_weight_initializer=h2r_weight_initializer)

self.rnn_param = Parameter('rnn_param', shape=(-1,), init=param_initializer,
allow_deferred_init=True, dtype=dtype)

def __repr__(self):
s = '{name}({mapping}, {_layout}'
@@ -116,8 +80,7 @@ def __repr__(self):
if self._dir == 2:
s += ', bidirectional'
s += ')'
shape = self.l0_i2h_weight.shape
mapping = '{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0] // self._gates)
mapping = '{0} -> {1}'.format(self._input_size if self._input_size else None, self._hidden_size)
return s.format(name=self.__class__.__name__,
mapping=mapping,
**self.__dict__)
@@ -196,37 +159,26 @@ def forward(self, inputs, states, sequence_length=None):
def infer_shape(self, inputs, *args):
assert inputs.ndim == 3, \
"Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"
if not self._projection_size:
step = self._hidden_size
else:
step = self._projection_size
ni = inputs.shape[2]
for i in range(self._num_layers):
for j in ['l', 'r'][:self._dir]:
name = '{}{}_i2h_weight'.format(j, i)
getattr(self, name).shape = (self._gates*self._hidden_size, ni)
ni = step * self._dir
self._input_size = inputs.shape[2]
ng, ni, nh = self._gates, inputs.shape[2], self._hidden_size

size = nh * self._dir * ng
size1 = (ni + nh + 2) * size # first layer size
size2 = (nh * self._dir + nh + 2) * size # second layer size
if self._projection_size:
size1 = (ni + self._projection_size + 2) * size # first layer size
size2 = (self._projection_size * self._dir + \
self._projection_size + 2) * size # second layer size
param_size = size1 + (self._num_layers - 1) * size2
if self._projection_size:
param_size += self._projection_size * nh * self._num_layers * self._dir
self.rnn_param.shape = (param_size, )

def _forward_kernel(self, inputs, states, sequence_length):
""" forward using CUDNN or CPU kenrel"""
ctx = inputs.ctx
if self._layout == 'NTC':
inputs = np.swapaxes(inputs, 0, 1)
if self._projection_size is None:
params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(ctx).reshape(-1)
for t in ['weight', 'bias']
for l in range(self._num_layers)
for d in ['l', 'r'][:self._dir]
for g in ['i2h', 'h2h'])
else:
params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(ctx).reshape(-1)
for t in ['weight', 'bias']
for l in range(self._num_layers)
for d in ['l', 'r'][:self._dir]
for g in ['i2h', 'h2h', 'h2r']
if g != 'h2r' or t != 'bias')

params = np.concatenate(params, axis=0)

if self._use_sequence_length:
rnn_args = states + [sequence_length]
@@ -238,7 +190,8 @@ def _forward_kernel(self, inputs, states, sequence_length):
new_args = args.as_in_ctx(ctx)
rnn_args_ctx.append(new_args)

rnn = npx.rnn(inputs, params, *rnn_args_ctx, use_sequence_length=self._use_sequence_length,
rnn = npx.rnn(inputs, self.rnn_param.data().as_in_ctx(ctx), *rnn_args_ctx,
use_sequence_length=self._use_sequence_length,
state_size=self._hidden_size, projection_size=self._projection_size,
num_layers=self._num_layers, bidirectional=self._dir == 2,
p=self._dropout, state_outputs=True, mode=self._mode,
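
For reference, the fused parameter size computed in infer_shape above can be checked against an explicit per-matrix sum. The sketch below is plain Python arithmetic (not MXNet code), covers only the no-projection case, and the concrete sizes in the asserts are illustrative values derived from the formula:

def fused_param_size(mode, num_layers, input_size, hidden_size, bidirectional=False):
    # Mirrors the size1/size2 computation in _RNNLayer.infer_shape (projection_size=None).
    gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]
    d = 2 if bidirectional else 1
    size = hidden_size * d * gates
    size1 = (input_size + hidden_size + 2) * size        # first layer
    size2 = (hidden_size * d + hidden_size + 2) * size   # each remaining layer
    return size1 + (num_layers - 1) * size2

def explicit_param_size(mode, num_layers, input_size, hidden_size, bidirectional=False):
    # Sums the individual i2h/h2h weights and biases per layer and direction.
    gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]
    d = 2 if bidirectional else 1
    total, ni = 0, input_size
    for _ in range(num_layers):
        per_direction = (gates * hidden_size * ni             # i2h_weight
                         + gates * hidden_size * hidden_size  # h2h_weight
                         + 2 * gates * hidden_size)           # i2h_bias + h2h_bias
        total += d * per_direction
        ni = hidden_size * d                                  # next layer consumes this layer's output
    return total

assert fused_param_size('lstm', 2, 10, 20) == explicit_param_size('lstm', 2, 10, 20) == 5920
assert fused_param_size('gru', 3, 32, 64, bidirectional=True) == \
       explicit_param_size('gru', 3, 32, 64, bidirectional=True) == 186624
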
76 changes: 76 additions & 0 deletions python/mxnet/gluon/utils.py
@@ -504,3 +504,79 @@ def _check_block_input_np_ndarrays(inputs):
for i in inputs:
_check_block_input_np_ndarrays(i)
# pylint: enable=no-else-raise


# pylint: disable=too-many-nested-blocks
def split_rnn_params(param, mode, num_layers, input_size, hidden_size, bidirectional=False, projection_size=None):
"""Split rnn layer parameter into weight and bias in different layer.
Parameters
----------
param : ndarray
The parameter of rnn layer.
mode : str
Mode of rnn. Supported modes: rnn_relu, rnn_tanh, lstm, gru
num_layers : int, default 1
Number of recurrent layers.
input_size: int, default 0
The number of expected features in the input x.
If not specified, it will be inferred from input.
hidden_size: int
The number of features in the hidden state h.
bidirectional: bool, default False
If `True`, becomes a bidirectional RNN.
projection_size: int, default None
The number of features after projection.
"""
gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]
dir = 2 if bidirectional else 1
param_dict = {}
begin = 0
if not projection_size:
for p in ['weight', 'bias']:
for l in range(num_layers):
for d in ['l', 'r'][:dir]:
for g in ['i2h', 'h2h']:
ni = input_size
if l != 0:
ni = hidden_size * dir
if g == 'h2h':
ni = hidden_size
shape0 = gates * hidden_size
if p == 'weight':
cur_len = shape0 * ni
param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
param[begin:begin+cur_len].reshape(shape0, ni)
else:
cur_len = shape0
param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
param[begin:begin+cur_len].reshape(shape0,)
begin += cur_len
else:
for p in ['weight', 'bias']:
for l in range(num_layers):
for d in ['l', 'r'][:dir]:
for g in ['i2h', 'h2h', 'h2r']:
if g != 'h2r' or p != 'bias':
if g == 'h2r':
cur_len = projection_size * hidden_size
param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
param[begin:begin+cur_len]. \
reshape(projection_size, hidden_size)
else:
ni = input_size
if l != 0:
ni = projection_size * dir
if g == 'h2h':
ni = projection_size
shape0 = gates * hidden_size
if p == 'weight':
cur_len = shape0 * ni
param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
param[begin:begin+cur_len].reshape(shape0, ni)
else:
cur_len = shape0
param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
param[begin:begin+cur_len].reshape(shape0,)
begin += cur_len
return param_dict
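
A minimal usage sketch for the new helper, assuming the MXNet 2.0 Gluon API; the layer sizes are illustrative. Once a forward pass has fixed the fused parameter's shape, the helper recovers per-layer views under the familiar l{layer}_{i2h,h2h}_{weight,bias} names:

import mxnet as mx
from mxnet.gluon import rnn
from mxnet.gluon.utils import split_rnn_params

layer = rnn.LSTM(hidden_size=20, num_layers=2, input_size=10)
layer.initialize()
x = mx.np.zeros((5, 3, 10))                    # (sequence, batch, input)
layer(x, layer.begin_state(batch_size=3))      # triggers deferred shape inference of rnn_param

parts = split_rnn_params(layer.rnn_param.data(), 'lstm',
                         num_layers=2, input_size=10, hidden_size=20)
print(sorted(parts))                   # ['l0_h2h_bias', 'l0_h2h_weight', 'l0_i2h_bias', ...]
print(parts['l0_i2h_weight'].shape)    # (80, 10): (gates * hidden_size, input_size)
print(parts['l1_i2h_weight'].shape)    # (80, 20): layer 1 consumes layer 0's output
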
119 changes: 119 additions & 0 deletions python/mxnet/initializer.py
@@ -711,3 +711,122 @@ def _init_weight(self, name, arr):
# gate of the 4 LSTM gates, we modify the according values.
num_hidden = int(arr.shape[0] / 4)
arr[num_hidden:2*num_hidden] = self.forget_bias


@register
class RNNFused(Initializer):
"""Initialize RNN fused parameter with bias part initialized to 0.0 and
weight initialized with random values uniformly sampled from a given range.
Parameters
----------
mode : {'gru', 'lstm', 'rnn_relu', 'rnn_tanh'}, required
the type of RNN to compute
num_layers : int (non-negative), required
number of stacked layers
state_size : int (non-negative), required
size of the state for each layer
bidirectional : boolean, optional, default=0
whether to use bidirectional recurrent layers
projection_size : int or None, optional, default='None'
size of project size
scale : float, optional
The bound on the range of the generated random values for weights.
Values are generated from the range [-`scale`, `scale`].
Default scale is 0.07.
"""
def __init__(self, mode, num_layers, state_size, bidirectional=False,
projection_size=None, i2h_weight_initializer=None,
h2h_weight_initializer=None, i2h_bias_initializer=None,
h2h_bias_initializer=None, h2r_weight_initializer=None):
super(RNNFused, self).__init__(mode=mode, num_layers=num_layers,
state_size=state_size,
bidirectional=bidirectional,
projection_size=projection_size,
i2h_weight_initializer=i2h_weight_initializer,
h2h_weight_initializer=h2h_weight_initializer,
i2h_bias_initializer=i2h_bias_initializer,
h2h_bias_initializer=h2h_bias_initializer,
h2r_weight_initializer=h2r_weight_initializer)
self.gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]
self.num_layers = num_layers
self.num_hidden = state_size
self.dir = 2 if bidirectional else 1
self.projection_size = projection_size
self._i2h_weight_initializer = i2h_weight_initializer
self._h2h_weight_initializer = h2h_weight_initializer
self._i2h_bias_initializer = i2h_bias_initializer
self._h2h_bias_initializer = h2h_bias_initializer
self._h2r_weight_initializer = h2r_weight_initializer

# pylint: disable=too-many-nested-blocks
def _init_weight(self, name, arr):
arr_len = arr.shape[0]
size = self.num_hidden * self.dir * self.gates
if not self.projection_size:
# second layer size
size2 = (self.num_hidden * self.dir + self.num_hidden + 2) * size
input_size = (arr_len - (self.num_layers - 1) * size2) // \
size - 2 - self.num_hidden
else:
# second layer size
size2 = (self.projection_size * self.dir + self.projection_size + 2) * size
size_projection = self.projection_size * self.num_hidden * self.num_layers * self.dir
input_size = (arr_len - size_projection - (self.num_layers - 1) * size2) // \
size - 2 - self.projection_size
begin = 0
if not self.projection_size:
for param in ['weight', 'bias']:
for layer_num in range(self.num_layers):
for _ in range(self.dir):
for connect in ['i2h', 'h2h']:
num_inputs = input_size
if layer_num != 0:
num_inputs = self.num_hidden * self.dir
if connect == 'h2h':
num_inputs = self.num_hidden
shape0 = self.gates * self.num_hidden
if param == 'weight':
cur_len = shape0 * num_inputs
else:
cur_len = shape0
self._init_util(param, connect, arr[begin:begin+cur_len])
begin += cur_len
else:
for param in ['weight', 'bias']:
for layer_num in range(self.num_layers):
for _ in range(self.dir):
for connect in ['i2h', 'h2h', 'h2r']:
if connect != 'h2r' or param != 'bias':
if connect == 'h2r':
cur_len = self.projection_size * self.num_hidden
else:
num_inputs = input_size
if layer_num != 0:
num_inputs = self.projection_size * self.dir
if connect == 'h2h':
num_inputs = self.projection_size
shape0 = self.gates * self.num_hidden
if param == 'weight':
cur_len = shape0 * num_inputs
else:
cur_len = shape0
self._init_util(param, connect, arr[begin:begin+cur_len])
begin += cur_len

def _init_util(self, param, connect, arr):
name = "_{}_{}_initializer".format(connect, param)
init = getattr(self, name)
create(init)(InitDesc(name, {'__init__': init}), arr)

def set_initializer(self, init):
self._i2h_weight_initializer = \
init if not self._i2h_weight_initializer else 'uniform'
self._h2h_weight_initializer = \
init if not self._h2h_weight_initializer else 'uniform'
self._i2h_bias_initializer = \
init if not self._i2h_bias_initializer else 'zero'
self._h2h_bias_initializer = \
init if not self._h2h_bias_initializer else 'zero'
self._h2r_weight_initializer = \
init if not self._h2r_weight_initializer else 'uniform'
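
Tying the pieces together, a small sketch (assumptions: MXNet 2.0 Gluon API, default bias initializers, illustrative sizes and Normal(0.02) choice) of how RNNFused hands each region of the fused array to its sub-initializer once the Parameter is materialized:

import mxnet as mx
from mxnet.gluon import rnn
from mxnet.gluon.utils import split_rnn_params

layer = rnn.GRU(hidden_size=8, num_layers=1, input_size=4)
layer.initialize(mx.init.Normal(0.02))          # becomes the weight sub-initializer via set_initializer
layer(mx.np.zeros((2, 1, 4)), layer.begin_state(batch_size=1))

parts = split_rnn_params(layer.rnn_param.data(), 'gru',
                         num_layers=1, input_size=4, hidden_size=8)
print(parts['l0_i2h_bias'])      # all zeros: the layer's default 'zeros' bias initializer is kept
print(parts['l0_i2h_weight'])    # drawn from Normal(0.02)
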
