Source code for npdl.layers.recurrent

# -*- coding: utf-8 -*-


import numpy as np

from .base import Layer
from .. import activations
from .. import initializations
from ..initializations import _one
from ..initializations import _zero


class Recurrent(Layer):
    """A recurrent neural network (RNN) is a class of artificial neural
    network where connections between units form a directed cycle. This
    creates an internal state of the network which allows it to exhibit
    dynamic temporal behavior. Unlike feedforward neural networks, RNNs can
    use their internal memory to process arbitrary sequences of inputs. This
    makes them applicable to tasks such as unsegmented connected handwriting
    recognition [1]_ or speech recognition [2]_.

    Parameters
    ----------
    n_out : int
        Number of hidden units.
    n_in : int or None
        Input dimension.
    nb_batch : int or None
        Batch size.
    nb_seq : int or None
        Sequence length.
    init : npdl.initializations.Initializer
        Initializer for the input-to-hidden weights.
    inner_init : npdl.initializations.Initializer
        Initializer for the hidden-to-hidden weights.
    activation : npdl.activations.Activation
        Activation function.
    return_sequence : bool
        Whether to return the full output sequence or only the last output.

    References
    ----------
    .. [1] A. Graves, M. Liwicki, S. Fernandez, R. Bertolami, H. Bunke,
           J. Schmidhuber. A Novel Connectionist System for Improved
           Unconstrained Handwriting Recognition. IEEE Transactions on
           Pattern Analysis and Machine Intelligence, vol. 31, no. 5, 2009.
    .. [2] H. Sak, A. W. Senior, and F. Beaufays. Long short-term memory
           recurrent neural network architectures for large scale acoustic
           modeling. Proc. Interspeech, pp. 338-342, Singapore, Sept. 2014.
    """

    def __init__(self, n_out, n_in=None, nb_batch=None, nb_seq=None,
                 init='glorot_uniform', inner_init='orthogonal',
                 activation='tanh', return_sequence=False):
        self.n_out = n_out
        self.n_in = n_in
        self.nb_batch = nb_batch
        self.nb_seq = nb_seq
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.activation_cls = activations.get(activation).__class__
        self.activation = activations.get(activation)
        self.return_sequence = return_sequence

        self.out_shape = None
        self.last_input = None
        self.last_output = None

    def connect_to(self, prev_layer=None):
        if prev_layer is not None:
            assert len(prev_layer.out_shape) == 3
            self.n_in = prev_layer.out_shape[-1]
            self.nb_batch = prev_layer.out_shape[0] or self.nb_batch
            self.nb_seq = prev_layer.out_shape[1] or self.nb_seq
        else:
            assert self.n_in is not None

        if self.return_sequence:
            self.out_shape = (self.nb_batch, self.nb_seq, self.n_out)
        else:
            self.out_shape = (self.nb_batch, self.n_out)
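
# The sketch below is illustrative only and not part of the library source:
# it shows how ``connect_to`` infers ``out_shape``, both for a standalone
# layer (``n_in`` given explicitly) and for a layer stacked on a previous
# recurrent layer. The helper name and all shapes are assumptions made up
# for the example.
def _recurrent_shape_example():  # hypothetical helper, not in npdl
    first = Recurrent(n_out=16, n_in=8, nb_batch=4, nb_seq=10,
                      return_sequence=True)
    first.connect_to()                     # standalone layer: n_in must be set
    assert first.out_shape == (4, 10, 16)  # (nb_batch, nb_seq, n_out)

    second = Recurrent(n_out=6)
    second.connect_to(first)               # inherits batch/seq/input dims
    assert second.out_shape == (4, 6)      # return_sequence=False by default
    return first, second
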

class SimpleRNN(Recurrent):
    """Fully-connected RNN where the output is fed back into the input.

    .. math:: o_t = tanh(W x_t + U o_{t-1} + b)

    Parameters
    ----------
    n_out : int
        Dimension of the internal projections and the final output.
    init : str or npdl.initializations.Initializer
        Weight initialization function. Can be the name of an existing
        function (str) or an npdl initializer.
    inner_init : str or npdl.initializations.Initializer
        Initialization function of the inner (hidden-to-hidden) weights.
    activation : str or npdl.activations.Activation
        Activation function. Can be the name of an existing function (str)
        or an npdl activation.
    return_sequence : bool
        If ``True``, a 3D ``numpy.array`` with shape
        ``(batch_size, timesteps, units)`` is returned. Otherwise, a 2D
        ``numpy.array`` with shape ``(batch_size, units)`` is returned.

    References
    ----------
    .. [1] A Theoretically Grounded Application of Dropout in Recurrent
           Neural Networks. http://arxiv.org/abs/1512.05287
    """

    def __init__(self, **kwargs):
        super(SimpleRNN, self).__init__(**kwargs)

        self.W, self.dW = None, None
        self.U, self.dU = None, None
        self.b, self.db = None, None
        self.activations = []

    def connect_to(self, prev_layer=None):
        super(SimpleRNN, self).connect_to(prev_layer)
        self.W = self.init((self.n_in, self.n_out))
        self.U = self.inner_init((self.n_out, self.n_out))
        self.b = _zero((self.n_out,))

    def forward(self, input, *args, **kwargs):
        assert np.ndim(input) == 3, 'Only support batch training.'

        self.last_input = input
        nb_batch, nb_timestep, nb_in = input.shape
        output = _zero((nb_batch, nb_timestep, self.n_out))

        if len(self.activations) == 0:
            self.activations = [self.activation_cls() for _ in range(nb_timestep)]

        output[:, 0, :] = self.activations[0].forward(np.dot(input[:, 0, :], self.W) + self.b)

        for i in range(1, nb_timestep):
            output[:, i, :] = self.activations[i].forward(
                np.dot(input[:, i, :], self.W) +
                np.dot(output[:, i - 1, :], self.U) + self.b)

        self.last_output = output
        if self.return_sequence:
            return self.last_output
        else:
            return self.last_output[:, -1, :]

    def backward(self, pre_grad, *args, **kwargs):
        self.dW = _zero(self.W.shape)
        self.dU = _zero(self.U.shape)
        self.db = _zero(self.b.shape)

        # hiddens.shape == (nb_timesteps, nb_batch, nb_out)
        hiddens = np.transpose(self.last_output, (1, 0, 2))

        if self.return_sequence:
            # check shape
            pre_grad = np.transpose(pre_grad, (1, 0, 2))
            assert hiddens.shape == pre_grad.shape

            nb_timesteps = pre_grad.shape[0]
            layer_grad = _zero(pre_grad.shape)

            for timestep1 in np.arange(nb_timesteps)[::-1]:
                delta = pre_grad[timestep1] * self.activations[timestep1].derivative()
                for timestep2 in np.arange(timestep1)[::-1]:
                    self.dU += np.dot(hiddens[timestep2].T, delta)
                    self.dW += np.dot(self.last_input[:, timestep2 + 1, :].T, delta)
                    self.db += np.mean(delta, axis=0)
                    if not self.first_layer:
                        layer_grad[timestep2 + 1] += np.dot(delta, self.W.T)
                    delta = np.dot(delta, self.U.T)

                # contribution of the first timestep
                if timestep1 == 0 or timestep2 == 0:
                    self.dW += np.dot(self.last_input[:, 0, :].T, delta)
                    self.db += np.mean(delta, axis=0)
                    if not self.first_layer:
                        layer_grad[0] += np.dot(delta, self.W.T)

        else:
            nb_timesteps = self.last_output.shape[1]
            nb_batchs = self.last_output.shape[0]
            assert (nb_batchs, self.last_output.shape[2]) == pre_grad.shape

            layer_grad = _zero(hiddens.shape)

            delta = pre_grad * self.activations[nb_timesteps - 1].derivative()
            for timestep2 in np.arange(nb_timesteps - 1)[::-1]:
                self.dU += np.dot(hiddens[timestep2].T, delta)
                self.dW += np.dot(self.last_input[:, timestep2 + 1, :].T, delta)
                self.db += np.mean(delta, axis=0)
                if not self.first_layer:
                    layer_grad[timestep2 + 1] += np.dot(delta, self.W.T)
                delta = np.dot(delta, self.U.T)

                # contribution of the first timestep
                if timestep2 == 0:
                    self.dW += np.dot(self.last_input[:, 0, :].T, delta)
                    self.db += np.mean(delta, axis=0)
                    if not self.first_layer:
                        layer_grad[0] += np.dot(delta, self.W.T)

        if not self.first_layer:
            return np.transpose(layer_grad, (1, 0, 2))

    @property
    def params(self):
        return self.W, self.U, self.b

    @property
    def grads(self):
        return self.dW, self.dU, self.db
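
# Usage sketch (illustrative only, not part of the library source): run a
# SimpleRNN forward pass on random data. The helper name, shapes, and input
# data are assumptions chosen for the example.
def _simple_rnn_example():  # hypothetical helper, not in npdl
    rnn = SimpleRNN(n_out=16, n_in=8, return_sequence=True)
    rnn.connect_to()                   # no previous layer, so n_in is required
    x = np.random.randn(4, 10, 8)      # (nb_batch, nb_seq, n_in)
    h = rnn.forward(x)                 # (4, 10, 16) because return_sequence=True
    return h
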

class GRU(Recurrent):
    """Gated recurrent units (GRUs) are a gating mechanism in recurrent
    neural networks, introduced in 2014. Their performance on polyphonic
    music modeling and speech signal modeling was found to be similar to
    that of long short-term memory (LSTM) [1]_. They have fewer parameters
    than LSTM, as they lack an output gate [2]_.

    .. math:: z_t = \sigma(U_z x_t + W_z s_{t-1} + b_z)

    .. math:: r_t = \sigma(U_r x_t + W_r s_{t-1} + b_r)

    .. math:: h_t = tanh(U_h x_t + W_h (s_{t-1} \odot r_t) + b_h)

    .. math:: s_t = (1 - z_t) \odot h_t + z_t \odot s_{t-1}

    Parameters
    ----------
    gate_activation : npdl.activations.Activation
        Gate activation function.
    need_grad : bool
        If ``True``, gradients will be calculated.

    References
    ----------
    .. [1] Chung, Junyoung; Gulcehre, Caglar; Cho, KyungHyun; Bengio, Yoshua
           (2014). "Empirical Evaluation of Gated Recurrent Neural Networks
           on Sequence Modeling". arXiv:1412.3555 [cs.NE].
    .. [2] "Recurrent Neural Network Tutorial, Part 4 – Implementing a
           GRU/LSTM RNN with Python and Theano – WildML". Wildml.com.
           Retrieved May 18, 2016.
    """

    def __init__(self, gate_activation='sigmoid', need_grad=True, **kwargs):
        super(GRU, self).__init__(**kwargs)

        self.gate_activation_cls = activations.get(gate_activation).__class__
        self.gate_activation = activations.get(gate_activation)
        self.need_grad = need_grad

        self.U_r, self.U_z, self.U_h = None, None, None
        self.W_r, self.W_z, self.W_h = None, None, None
        self.b_r, self.b_z, self.b_h = None, None, None

        self.grad_U_r, self.grad_U_z, self.grad_U_h = None, None, None
        self.grad_W_r, self.grad_W_z, self.grad_W_h = None, None, None
        self.grad_b_r, self.grad_b_z, self.grad_b_h = None, None, None

    def connect_to(self, prev_layer=None):
        super(GRU, self).connect_to(prev_layer)

        # Weight matrices for the input x
        self.U_r = self.init((self.n_in, self.n_out))
        self.U_z = self.init((self.n_in, self.n_out))
        self.U_h = self.init((self.n_in, self.n_out))

        # Weight matrices for the hidden state
        self.W_r = self.inner_init((self.n_out, self.n_out))
        self.W_z = self.inner_init((self.n_out, self.n_out))
        self.W_h = self.inner_init((self.n_out, self.n_out))

        # Biases
        self.b_r = _zero((self.n_out,))
        self.b_z = _zero((self.n_out,))
        self.b_h = _zero((self.n_out,))

    def forward(self, input, *args, **kwargs):
        assert np.ndim(input) == 3, 'Only support batch training.'

        # record
        self.last_input = input

        # dim
        nb_batch, nb_timesteps, nb_in = input.shape

        # outputs
        output = _zero((nb_batch, nb_timesteps, self.n_out))

        # forward
        for i in range(nb_timesteps):
            # data
            s_pre = _zero((nb_batch, self.n_out)) if i == 0 else output[:, i - 1, :]
            x_now = input[:, i, :]

            # computation
            z_now = self.gate_activation.forward(np.dot(x_now, self.U_z) +
                                                 np.dot(s_pre, self.W_z) + self.b_z)
            r_now = self.gate_activation.forward(np.dot(x_now, self.U_r) +
                                                 np.dot(s_pre, self.W_r) + self.b_r)
            h_now = self.activation.forward(np.dot(x_now, self.U_h) +
                                            np.dot(s_pre * r_now, self.W_h) + self.b_h)
            output[:, i, :] = (1 - z_now) * h_now + z_now * s_pre

        # record
        self.last_output = output

        # return
        if self.return_sequence:
            return self.last_output
        else:
            return self.last_output[:, -1, :]

    def backward(self, pre_grad, *args, **kwargs):
        raise NotImplementedError

    @property
    def params(self):
        return self.U_r, self.U_z, self.U_h, \
               self.W_r, self.W_z, self.W_h, \
               self.b_r, self.b_z, self.b_h

    @property
    def grads(self):
        return self.grad_U_r, self.grad_U_z, self.grad_U_h, \
               self.grad_W_r, self.grad_W_z, self.grad_W_h, \
               self.grad_b_r, self.grad_b_z, self.grad_b_h
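
# Usage sketch (illustrative only, not part of the library source): a GRU
# forward pass. Since ``backward`` is not implemented, the layer is only
# usable for inference here. The helper name and shapes are assumptions.
def _gru_example():  # hypothetical helper, not in npdl
    gru = GRU(n_out=32, n_in=8)        # return_sequence defaults to False
    gru.connect_to()
    x = np.random.randn(4, 10, 8)      # (nb_batch, nb_seq, n_in)
    s_last = gru.forward(x)            # (4, 32): only the final state s_t
    return s_last
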

class LSTM(Recurrent):
    """Batch LSTM: supports masking, but does not support training.

    Long short-term memory (LSTM) is a recurrent neural network (RNN)
    architecture (an artificial neural network) proposed in 1997 by Sepp
    Hochreiter and Jürgen Schmidhuber [1]_ and further improved in 2000 by
    Felix Gers et al. [2]_. Like most RNNs, an LSTM network is universal in
    the sense that, given enough network units, it can compute anything a
    conventional computer can compute, provided it has the proper weight
    matrix, which may be viewed as its program.

    .. math:: f_t = \sigma(U_f x_t + W_f h_{t-1} + b_f)

    .. math:: i_t = \sigma(U_i x_t + W_i h_{t-1} + b_i)

    .. math:: o_t = \sigma(U_o x_t + W_o h_{t-1} + b_o)

    .. math:: g_t = tanh(U_g x_t + W_g h_{t-1} + b_g)

    .. math:: c_t = f_t \odot c_{t-1} + i_t \odot g_t

    .. math:: h_t = o_t \odot tanh(c_t)

    Parameters
    ----------
    gate_activation : npdl.activations.Activation
        Gate activation function.
    need_grad : bool
        If ``True``, gradients will be calculated.
    forget_bias_num : int
        Initial value of the forget-gate bias.

    References
    ----------
    .. [1] Sepp Hochreiter; Jürgen Schmidhuber (1997). "Long short-term
           memory". Neural Computation. 9 (8): 1735–1780.
           doi:10.1162/neco.1997.9.8.1735. PMID 9377276.
    .. [2] Felix A. Gers; Jürgen Schmidhuber; Fred Cummins (2000). "Learning
           to Forget: Continual Prediction with LSTM". Neural Computation.
           12 (10): 2451–2471. doi:10.1162/089976600300015015.
    """

    def __init__(self, gate_activation='sigmoid', need_grad=True,
                 forget_bias_num=1, **kwargs):
        super(LSTM, self).__init__(**kwargs)

        self.gate_activation_cls = activations.get(gate_activation).__class__
        self.gate_activation = activations.get(gate_activation)
        self.need_grad = need_grad
        self.forget_bias_num = forget_bias_num

        self.U_g, self.U_i, self.U_f, self.U_o = None, None, None, None
        self.W_g, self.W_i, self.W_f, self.W_o = None, None, None, None
        self.b_g, self.b_i, self.b_f, self.b_o = None, None, None, None

        self.grad_U_g, self.grad_U_i, self.grad_U_f, self.grad_U_o = None, None, None, None
        self.grad_W_g, self.grad_W_i, self.grad_W_f, self.grad_W_o = None, None, None, None
        self.grad_b_g, self.grad_b_i, self.grad_b_f, self.grad_b_o = None, None, None, None

        self.c0, self.h0 = None, None
        self.last_cell = None

    def connect_to(self, prev_layer=None):
        super(LSTM, self).connect_to(prev_layer)

        # Weight matrices for the input x
        self.U_g = self.init((self.n_in, self.n_out))
        self.U_i = self.init((self.n_in, self.n_out))
        self.U_f = self.init((self.n_in, self.n_out))
        self.U_o = self.init((self.n_in, self.n_out))

        # Weight matrices for the hidden state
        self.W_g = self.inner_init((self.n_out, self.n_out))
        self.W_i = self.inner_init((self.n_out, self.n_out))
        self.W_f = self.inner_init((self.n_out, self.n_out))
        self.W_o = self.inner_init((self.n_out, self.n_out))

        # Biases (the forget-gate bias is initialized to forget_bias_num)
        self.b_g = _zero((self.n_out,))
        self.b_i = _zero((self.n_out,))
        self.b_f = _one((self.n_out,)) * self.forget_bias_num
        self.b_o = _zero((self.n_out,))

    def forward(self, input, mask, c0=None, h0=None):
        assert np.ndim(input) == 3, 'Only support batch training.'

        # record
        self.last_input = input

        # dim
        nb_batch, nb_timesteps, nb_in = input.shape

        # data
        output = _zero((nb_batch, nb_timesteps, self.n_out))
        cell = _zero((nb_batch, nb_timesteps, self.n_out))
        self.c0 = _zero((nb_batch, self.n_out)) if c0 is None else c0
        self.h0 = _zero((nb_batch, self.n_out)) if h0 is None else h0

        # forward
        for t in range(nb_timesteps):
            # data
            h_pre = self.h0 if t == 0 else output[:, t - 1, :]
            c_pre = self.c0 if t == 0 else cell[:, t - 1, :]
            x_now = input[:, t, :]
            m_now = mask[:, t]

            # computation
            f = self.gate_activation.forward(np.dot(x_now, self.U_f) +
                                             np.dot(h_pre, self.W_f) + self.b_f)
            i = self.gate_activation.forward(np.dot(x_now, self.U_i) +
                                             np.dot(h_pre, self.W_i) + self.b_i)
            o = self.gate_activation.forward(np.dot(x_now, self.U_o) +
                                             np.dot(h_pre, self.W_o) + self.b_o)
            g = self.activation.forward(np.dot(x_now, self.U_g) +
                                        np.dot(h_pre, self.W_g) + self.b_g)
            c = f * c_pre + i * g
            c = m_now[:, None] * c + (1.0 - m_now)[:, None] * c_pre
            h = o * self.activation.forward(c)
            h = m_now[:, None] * h + (1.0 - m_now)[:, None] * h_pre

            # record
            self.h0 = h
            self.c0 = c
            output[:, t, :] = h
            cell[:, t, :] = c

        # record
        self.last_output = output
        self.last_cell = cell

        # return
        if self.return_sequence:
            return self.last_output
        else:
            return self.last_output[:, -1, :]

    def backward(self, pre_grad, *args, **kwargs):
        # reset
        self.grad_W_g = _zero(self.W_g.shape)
        self.grad_W_i = _zero(self.W_i.shape)
        self.grad_W_f = _zero(self.W_f.shape)
        self.grad_W_o = _zero(self.W_o.shape)

        self.grad_U_g = _zero(self.U_g.shape)
        self.grad_U_i = _zero(self.U_i.shape)
        self.grad_U_f = _zero(self.U_f.shape)
        self.grad_U_o = _zero(self.U_o.shape)

        self.grad_b_g = _zero(self.b_g.shape)
        self.grad_b_i = _zero(self.b_i.shape)
        self.grad_b_f = _zero(self.b_f.shape)
        self.grad_b_o = _zero(self.b_o.shape)

        # backward
        raise NotImplementedError

    @property
    def params(self):
        return self.U_g, self.U_i, self.U_f, self.U_o, \
               self.W_g, self.W_i, self.W_f, self.W_o, \
               self.b_g, self.b_i, self.b_f, self.b_o

    @property
    def grads(self):
        return self.grad_U_g, self.grad_U_i, self.grad_U_f, self.grad_U_o, \
               self.grad_W_g, self.grad_W_i, self.grad_W_f, self.grad_W_o, \
               self.grad_b_g, self.grad_b_i, self.grad_b_f, self.grad_b_o
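
# Usage sketch (illustrative only, not part of the library source): a masked
# LSTM forward pass. For masked time steps the cell and hidden states are
# carried over unchanged from the previous step. The helper name, shapes,
# and mask pattern are assumptions chosen for the example.
def _masked_lstm_example():  # hypothetical helper, not in npdl
    lstm = LSTM(n_out=32, n_in=8, return_sequence=True)
    lstm.connect_to()
    x = np.random.randn(4, 10, 8)      # (nb_batch, nb_seq, n_in)
    mask = np.ones((4, 10))
    mask[2, 7:] = 0.0                  # sequence 2 has only 7 valid steps
    h = lstm.forward(x, mask)          # (4, 10, 32); masked steps repeat the last valid h
    return h
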

class BatchLSTM(Recurrent):
    """Batch LSTM: supports training, but does not support masking.

    Parameters
    ----------
    gate_activation : npdl.activations.Activation
        Gate activation function.
    need_grad : bool
        If ``True``, gradients will be calculated.
    forget_bias_num : int
        Initial value of the forget-gate bias.

    References
    ----------
    .. [1] Sepp Hochreiter; Jürgen Schmidhuber (1997). "Long short-term
           memory". Neural Computation. 9 (8): 1735–1780.
           doi:10.1162/neco.1997.9.8.1735. PMID 9377276.
    .. [2] Felix A. Gers; Jürgen Schmidhuber; Fred Cummins (2000). "Learning
           to Forget: Continual Prediction with LSTM". Neural Computation.
           12 (10): 2451–2471. doi:10.1162/089976600300015015.
    """

    def __init__(self, gate_activation='sigmoid', need_grad=True,
                 forget_bias_num=1, **kwargs):
        super(BatchLSTM, self).__init__(**kwargs)

        self.gate_activation_cls = activations.get(gate_activation).__class__
        self.gate_activation = activations.get(gate_activation)
        self.need_grad = need_grad
        self.forget_bias_num = forget_bias_num

        self.AllW, self.d_AllW = None, None
        self.c0, self.d_c0 = None, None
        self.h0, self.d_h0 = None, None
        self.IFOGf = None
        self.IFOG = None
        self.Hin = None
        self.Ct = None
        self.C = None

    def connect_to(self, prev_layer=None):
        """Connection to the previous layer.

        Parameters
        ----------
        prev_layer : npdl.layers.Layer or None
            Previous layer.

        Notes
        -----
        The parameters of all four gates are packed into a single matrix
        ``AllW`` of shape ``(1 + n_in + n_out, 4 * n_out)``, laid out as:

            ====  ===  ===  ===  ===
            type   i    f    o    g
            ====  ===  ===  ===  ===
            bias
            x2h
            h2h
            ====  ===  ===  ===  ===
        """
        super(BatchLSTM, self).connect_to(prev_layer)
        n_in = self.n_in
        n_out = self.n_out

        # init weights
        self.AllW = _zero((n_in + n_out + 1, 4 * n_out))

        # bias (the forget-gate bias is initialized to forget_bias_num)
        if self.forget_bias_num != 0:
            self.AllW[0, self.n_out: 2 * self.n_out] = self.forget_bias_num

        # Weight matrices for the input x
        self.AllW[1:n_in + 1, n_out * 0:n_out * 1] = self.init((n_in, n_out))
        self.AllW[1:n_in + 1, n_out * 1:n_out * 2] = self.init((n_in, n_out))
        self.AllW[1:n_in + 1, n_out * 2:n_out * 3] = self.init((n_in, n_out))
        self.AllW[1:n_in + 1, n_out * 3:n_out * 4] = self.init((n_in, n_out))

        # Weight matrices for the hidden state
        self.AllW[n_in + 1:, n_out * 0:n_out * 1] = self.inner_init((n_out, n_out))
        self.AllW[n_in + 1:, n_out * 1:n_out * 2] = self.inner_init((n_out, n_out))
        self.AllW[n_in + 1:, n_out * 2:n_out * 3] = self.inner_init((n_out, n_out))
        self.AllW[n_in + 1:, n_out * 3:n_out * 4] = self.inner_init((n_out, n_out))

    def forward(self, input, c0=None, h0=None):
        """Forward propagation.

        Parameters
        ----------
        input : numpy.array
            Input of shape ``(nb_batch, nb_seq, n_in)``.
        c0 : numpy.array or None
            Initial cell state.
        h0 : numpy.array or None
            Initial hidden state.

        Returns
        -------
        numpy.array
            Forward results.
        """
        # checking
        assert np.ndim(input) == 3, 'Only support batch training.'
        assert input.shape[2] == self.n_in

        # shape
        nb_batch, nb_seq, n_in = input.shape
        self.nb_batch = nb_batch
        self.nb_seq = nb_seq

        # data
        input = np.transpose(input, (1, 0, 2))
        self.c0 = _zero((nb_batch, self.n_out)) if c0 is None else c0
        self.h0 = _zero((nb_batch, self.n_out)) if h0 is None else h0

        # Perform the LSTM forward pass with X as the input.
        # x plus h plus bias
        xphpb = self.AllW.shape[0]
        # input [1, xt, ht-1] to each tick of the LSTM
        Hin = _zero((nb_seq, nb_batch, xphpb))
        # hidden representation of the LSTM (gated cell content)
        Hout = _zero((nb_seq, nb_batch, self.n_out))
        # input, forget, output, gate (IFOG)
        IFOG = _zero((nb_seq, nb_batch, self.n_out * 4))
        # after nonlinearity
        IFOGf = _zero((nb_seq, nb_batch, self.n_out * 4))
        # cell content
        C = _zero((nb_seq, nb_batch, self.n_out))
        # tanh of cell content
        Ct = _zero((nb_seq, nb_batch, self.n_out))

        for t in range(nb_seq):
            # concat [x, h] as input to the LSTM
            prevh = Hout[t - 1] if t > 0 else self.h0
            # bias
            Hin[t, :, 0] = 1
            Hin[t, :, 1:n_in + 1] = input[t]
            Hin[t, :, n_in + 1:] = prevh
            # compute all gate activations (most of the work is this line)
            IFOG[t] = Hin[t].dot(self.AllW)
            # non-linearities
            # sigmoids; these are the gates
            IFOGf[t, :, :3 * self.n_out] = 1.0 / (1.0 + np.exp(-IFOG[t, :, :3 * self.n_out]))
            # tanh
            IFOGf[t, :, 3 * self.n_out:] = np.tanh(IFOG[t, :, 3 * self.n_out:])
            # compute the cell activation
            prevc = C[t - 1] if t > 0 else self.c0
            C[t] = IFOGf[t, :, :self.n_out] * IFOGf[t, :, 3 * self.n_out:] + \
                   IFOGf[t, :, self.n_out:2 * self.n_out] * prevc
            Ct[t] = np.tanh(C[t])
            Hout[t] = IFOGf[t, :, 2 * self.n_out:3 * self.n_out] * Ct[t]

        # record
        self.last_output = np.transpose(Hout, (1, 0, 2))
        self.IFOGf = IFOGf
        self.IFOG = IFOG
        self.Hin = Hin
        self.Ct = Ct
        self.C = C

        if self.return_sequence:
            return self.last_output
        else:
            return self.last_output[:, -1, :]

    def backward(self, pre_grad, dcn=None, dhn=None):
        """Backward propagation.

        Parameters
        ----------
        pre_grad : numpy.array
            Gradients propagated to this layer.
        dcn : numpy.array
            Gradient of the cell state at the last time step.
        dhn : numpy.array
            Gradient of the hidden state at the last time step.

        Returns
        -------
        numpy.array
            The gradients propagated to the previous layer.
        """
        Hout = np.transpose(self.last_output, (1, 0, 2))
        nb_seq, batch_size, n_out = Hout.shape
        input_size = self.AllW.shape[0] - n_out - 1  # -1 due to bias

        self.d_AllW = _zero(self.AllW.shape)
        self.d_h0 = _zero((batch_size, n_out))

        # backprop the LSTM
        dIFOG = _zero(self.IFOG.shape)
        dIFOGf = _zero(self.IFOGf.shape)
        dHin = _zero(self.Hin.shape)
        dC = _zero(self.C.shape)
        layer_grad = _zero((nb_seq, batch_size, input_size))

        # prepare layer gradients
        if self.return_sequence:
            timesteps = list(range(nb_seq))[::-1]
            assert np.ndim(pre_grad) == 3
        else:
            timesteps = [nb_seq - 1]
            assert np.ndim(pre_grad) == 2
            tmp = _zero((self.nb_batch, self.nb_seq, self.n_out))
            tmp[:, -1, :] = pre_grad
            pre_grad = tmp
        # make a copy so we don't have any funny side effects
        dHout = np.transpose(pre_grad, (1, 0, 2)).copy()

        # carry over gradients from later
        if dcn is not None:
            dC[nb_seq - 1] += dcn.copy()
        if dhn is not None:
            dHout[nb_seq - 1] += dhn.copy()

        for t in timesteps:
            tanhCt = self.Ct[t]
            dIFOGf[t, :, 2 * n_out:3 * n_out] = tanhCt * dHout[t]
            # backprop the tanh non-linearity first, then continue backprop
            dC[t] += (1 - tanhCt ** 2) * (self.IFOGf[t, :, 2 * n_out:3 * n_out] * dHout[t])

            if t > 0:
                dIFOGf[t, :, n_out:2 * n_out] = self.C[t - 1] * dC[t]
                dC[t - 1] += self.IFOGf[t, :, n_out:2 * n_out] * dC[t]
            else:
                dIFOGf[t, :, n_out:2 * n_out] = self.c0 * dC[t]
                self.d_c0 = self.IFOGf[t, :, n_out:2 * n_out] * dC[t]
            dIFOGf[t, :, :n_out] = self.IFOGf[t, :, 3 * n_out:] * dC[t]
            dIFOGf[t, :, 3 * n_out:] = self.IFOGf[t, :, :n_out] * dC[t]

            # backprop activation functions
            dIFOG[t, :, 3 * n_out:] = (1 - self.IFOGf[t, :, 3 * n_out:] ** 2) * dIFOGf[t, :, 3 * n_out:]
            y = self.IFOGf[t, :, :3 * n_out]
            dIFOG[t, :, :3 * n_out] = (y * (1.0 - y)) * dIFOGf[t, :, :3 * n_out]

            # backprop matrix multiply
            self.d_AllW += np.dot(self.Hin[t].transpose(), dIFOG[t])
            dHin[t] = dIFOG[t].dot(self.AllW.transpose())

            # backprop the identity transforms into Hin
            layer_grad[t] = dHin[t, :, 1:input_size + 1]
            if t > 0:
                dHout[t - 1, :] += dHin[t, :, input_size + 1:]
            else:
                self.d_h0 += dHin[t, :, input_size + 1:]

        layer_grad = np.transpose(layer_grad, (1, 0, 2))
        return layer_grad

    @property
    def params(self):
        return [self.AllW, ]

    @property
    def grads(self):
        return [self.d_AllW, ]
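
# Usage sketch (illustrative only, not part of the library source): a
# BatchLSTM forward and backward pass. ``backward`` returns the gradient
# with respect to the layer input, with the same shape as that input. The
# helper name, shapes, and the all-ones upstream gradient are assumptions.
def _batch_lstm_example():  # hypothetical helper, not in npdl
    lstm = BatchLSTM(n_out=32, n_in=8, return_sequence=True)
    lstm.connect_to()
    x = np.random.randn(4, 10, 8)           # (nb_batch, nb_seq, n_in)
    h = lstm.forward(x)                      # (4, 10, 32)
    dx = lstm.backward(np.ones_like(h))      # (4, 10, 8), gradient w.r.t. x
    return h, dx, lstm.grads[0]              # grads[0] is d_AllW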