DL | Coursera Deep Learning Specialization, Course 5 (Sequence Models: Week 1 Programming Assignment)
Building your Recurrent Neural Network - Step by Step
1 - Forward propagation for the basic Recurrent Neural Network
1.1 - RNN cell
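For reference, the RNN-cell update that the comments below call "the formula given above" (and that the code implements) is:

$a^{\langle t \rangle} = \tanh(W_{aa} a^{\langle t-1 \rangle} + W_{ax} x^{\langle t \rangle} + b_a)$
$\hat{y}^{\langle t \rangle} = \mathrm{softmax}(W_{ya} a^{\langle t \rangle} + b_y)$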
import numpy as np
from rnn_utils import *   # helper module shipped with the assignment (provides sigmoid and softmax)

# GRADED FUNCTION: rnn_cell_forward

def rnn_cell_forward(xt, a_prev, parameters):
    """
    Implements a single forward step of the RNN-cell as described in Figure (2).

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
        ba -- Bias, numpy array of shape (n_a, 1)
        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)

    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, a_prev, xt, parameters)
    """
    # Retrieve parameters from "parameters"
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    ### START CODE HERE ### (≈2 lines)
    # compute next activation state using the formula given above
    a_next = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba)
    # compute output of the current cell using the formula given above
    yt_pred = softmax(np.dot(Wya, a_next) + by)
    ### END CODE HERE ###

    # store values you need for backward propagation in cache
    cache = (a_next, a_prev, xt, parameters)

    return a_next, yt_pred, cache
np.random.seed(1)
xt = np.random.randn(3,10)
a_prev = np.random.randn(5,10)
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", a_next.shape)
print("yt_pred[1] =", yt_pred[1])
print("yt_pred.shape = ", yt_pred.shape)
a_next[4] = [ 0.59584544 0.18141802 0.61311866 0.99808218 0.85016201 0.99980978
-0.18887155 0.99815551 0.6531151 0.82872037]
a_next.shape = (5, 10)
yt_pred[1] = [ 0.9888161 0.01682021 0.21140899 0.36817467 0.98988387 0.88945212
0.36920224 0.9966312 0.9982559 0.17746526]
yt_pred.shape = (2, 10)
1.2 - RNN forward pass
# GRADED FUNCTION: rnn_forward

def rnn_forward(x, a0, parameters):
    """
    Implement the forward propagation of the recurrent neural network described in Figure (3).

    Arguments:
    x -- Input data for every time-step, of shape (n_x, m, T_x).
    a0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
        ba -- Bias, numpy array of shape (n_a, 1)
        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)

    Returns:
    a -- Hidden states for every time-step, numpy array of shape (n_a, m, T_x)
    y_pred -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    caches -- tuple of values needed for the backward pass, contains (list of caches, x)
    """
    # Initialize "caches" which will contain the list of all caches
    caches = []

    # Retrieve dimensions from shapes of x and Wya
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape

    ### START CODE HERE ###
    # initialize "a" and "y" with zeros (≈2 lines)
    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))

    # Initialize a_next (≈1 line)
    a_next = a0

    # loop over all time-steps
    for t in range(T_x):
        # Update next hidden state, compute the prediction, get the cache (≈1 line)
        a_next, yt_pred, cache = rnn_cell_forward(x[:, :, t], a_next, parameters)
        # Save the value of the new "next" hidden state in a (≈1 line)
        a[:, :, t] = a_next
        # Save the value of the prediction in y (≈1 line)
        y_pred[:, :, t] = yt_pred
        # Append "cache" to "caches" (≈1 line)
        caches.append(cache)
    ### END CODE HERE ###

    # store values needed for backward propagation in cache
    caches = (caches, x)

    return a, y_pred, caches
np.random.seed(1)
x = np.random.randn(3,10,4)
a0 = np.random.randn(5,10)
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}a, y_pred, caches = rnn_forward(x, a0, parameters)
print("a[4][1] = ", a[4][1])
print("a.shape = ", a.shape)
print("y_pred[1][3] =", y_pred[1][3])
print("y_pred.shape = ", y_pred.shape)
print("caches[1][1][3] =", caches[1][1][3])
print("len(caches) = ", len(caches))
a[4][1] = [-0.99999375 0.77911235 -0.99861469 -0.99833267]
a.shape = (5, 10, 4)
y_pred[1][3] = [ 0.79560373 0.86224861 0.11118257 0.81515947]
y_pred.shape = (2, 10, 4)
caches[1][1][3] = [-1.1425182 -0.34934272 -0.20889423 0.58662319]
len(caches) = 2
2 - Long Short-Term Memory (LSTM) network
2.1 - LSTM cell
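The LSTM cell below implements the standard gate equations (the "figure (4)" referenced in the comments is not reproduced in this post). Written out, with $\Gamma$ for the gates, $\odot$ for element-wise product, and $[a^{\langle t-1 \rangle}, x^{\langle t \rangle}]$ for the concatenated hidden state and input:

$\Gamma_f = \sigma(W_f [a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_f)$
$\Gamma_i = \sigma(W_i [a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_i)$
$\tilde{c}^{\langle t \rangle} = \tanh(W_c [a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_c)$
$c^{\langle t \rangle} = \Gamma_f \odot c^{\langle t-1 \rangle} + \Gamma_i \odot \tilde{c}^{\langle t \rangle}$
$\Gamma_o = \sigma(W_o [a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_o)$
$a^{\langle t \rangle} = \Gamma_o \odot \tanh(c^{\langle t \rangle})$
$\hat{y}^{\langle t \rangle} = \mathrm{softmax}(W_y a^{\langle t \rangle} + b_y)$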
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    # Retrieve parameters from "parameters"
    Wf = parameters["Wf"]
    bf = parameters["bf"]
    Wi = parameters["Wi"]
    bi = parameters["bi"]
    Wc = parameters["Wc"]
    bc = parameters["bc"]
    Wo = parameters["Wo"]
    bo = parameters["bo"]
    Wy = parameters["Wy"]
    by = parameters["by"]

    # Retrieve dimensions from shapes of xt and Wy
    n_x, m = xt.shape
    n_y, n_a = Wy.shape

    ### START CODE HERE ###
    # Concatenate a_prev and xt (≈3 lines)
    concat = np.zeros((n_a + n_x, m))
    concat[:n_a, :] = a_prev
    concat[n_a:, :] = xt

    # Compute values for ft, it, cct, c_next, ot, a_next using the formulas given in figure (4) (≈6 lines)
    ft = sigmoid(np.dot(Wf, concat) + bf)
    it = sigmoid(np.dot(Wi, concat) + bi)
    cct = np.tanh(np.dot(Wc, concat) + bc)
    c_next = ft * c_prev + it * cct
    ot = sigmoid(np.dot(Wo, concat) + bo)
    a_next = ot * np.tanh(c_next)

    # Compute prediction of the LSTM cell (≈1 line)
    yt_pred = softmax(np.dot(Wy, a_next) + by)
    ### END CODE HERE ###

    # store values needed for backward propagation in cache
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    return a_next, c_next, yt_pred, cache
np.random.seed(1)
xt = np.random.randn(3,10)
a_prev = np.random.randn(5,10)
c_prev = np.random.randn(5,10)
Wf = np.random.randn(5, 5+3)
bf = np.random.randn(5,1)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5,1)
Wo = np.random.randn(5, 5+3)
bo = np.random.randn(5,1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)
Wy = np.random.randn(2,5)
by = np.random.randn(2,1)
parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy, "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}
a_next, c_next, yt, cache = lstm_cell_forward(xt, a_prev, c_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", c_next.shape)
print("c_next[2] = ", c_next[2])
print("c_next.shape = ", c_next.shape)
print("yt[1] =", yt[1])
print("yt.shape = ", yt.shape)
print("cache[1][3] =", cache[1][3])
print("len(cache) = ", len(cache))
a_next[4] = [-0.66408471 0.0036921 0.02088357 0.22834167 -0.85575339 0.00138482
0.76566531 0.34631421 -0.00215674 0.43827275]
a_next.shape = (5, 10)
c_next[2] = [ 0.63267805 1.00570849 0.35504474 0.20690913 -1.64566718 0.11832942
0.76449811 -0.0981561 -0.74348425 -0.26810932]
c_next.shape = (5, 10)
yt[1] = [ 0.79913913 0.15986619 0.22412122 0.15606108 0.97057211 0.31146381
0.00943007 0.12666353 0.39380172 0.07828381]
yt.shape = (2, 10)
cache[1][3] = [-0.16263996 1.03729328 0.72938082 -0.54101719 0.02752074 -0.30821874
0.07651101 -1.03752894 1.41219977 -0.37647422]
len(cache) = 10
2.2 - Forward pass for LSTM
def lstm_forward(x, a0, parameters):
    # Initialize "caches", which will track the list of all the caches
    caches = []

    ### START CODE HERE ###
    # Retrieve dimensions from shapes of x and Wy (≈2 lines)
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wy"].shape

    # initialize "a", "c" and "y" with zeros (≈3 lines)
    a = np.zeros((n_a, m, T_x))
    c = np.zeros(a.shape)
    y = np.zeros((n_y, m, T_x))

    # Initialize a_next and c_next (≈2 lines)
    a_next = a0
    c_next = c[:, :, 0]

    # loop over all time-steps
    for t in range(T_x):
        # Update next hidden state, next memory state, compute the prediction, get the cache (≈1 line)
        a_next, c_next, yt, cache = lstm_cell_forward(x[:, :, t], a_next, c_next, parameters)
        # Save the value of the new "next" hidden state in a (≈1 line)
        a[:, :, t] = a_next
        # Save the value of the prediction in y (≈1 line)
        y[:, :, t] = yt
        # Save the value of the next cell state (≈1 line)
        c[:, :, t] = c_next
        # Append the cache into caches (≈1 line)
        caches.append(cache)
    ### END CODE HERE ###

    # store values needed for backward propagation in cache
    caches = (caches, x)

    return a, y, c, caches
np.random.seed(1)
x = np.random.randn(3,10,7)
a0 = np.random.randn(5,10)
Wf = np.random.randn(5, 5+3)
bf = np.random.randn(5,1)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5,1)
Wo = np.random.randn(5, 5+3)
bo = np.random.randn(5,1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)
Wy = np.random.randn(2,5)
by = np.random.randn(2,1)
parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy, "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}
a, y, c, caches = lstm_forward(x, a0, parameters)
print("a[4][3][6] = ", a[4][3][6])
print("a.shape = ", a.shape)
print("y[1][4][3] =", y[1][4][3])
print("y.shape = ", y.shape)
print("caches[1][1[1]] =", caches[1][1][1])
print("c[1][2][1]", c[1][2][1])
print("len(caches) = ", len(caches))
a[4][3][6] = 0.172117767533
a.shape = (5, 10, 7)
y[1][4][3] = 0.95087346185
y.shape = (2, 10, 7)
caches[1][1][1] = [ 0.82797464 0.23009474 0.76201118 -0.22232814 -0.20075807 0.18656139
0.41005165]
c[1][2][1] -0.855544916718
len(caches) = 2
3 - Backpropagation in recurrent neural networks (OPTIONAL / UNGRADED)
3.1 - Basic RNN backward pass
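The cell backward step differentiates $a^{\langle t \rangle} = \tanh(W_{aa} a^{\langle t-1 \rangle} + W_{ax} x^{\langle t \rangle} + b_a)$. With $dz = (1 - (a^{\langle t \rangle})^2) \odot da^{\langle t \rangle}$ (the dtanh in the code below), the gradients it returns are:

$dW_{ax} = dz \, (x^{\langle t \rangle})^T, \quad dW_{aa} = dz \, (a^{\langle t-1 \rangle})^T, \quad db_a = \sum_{\text{batch}} dz$
$dx^{\langle t \rangle} = W_{ax}^T \, dz, \quad da^{\langle t-1 \rangle} = W_{aa}^T \, dz$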
def rnn_cell_backward(da_next, cache):
    # Retrieve values from cache
    (a_next, a_prev, xt, parameters) = cache

    # Retrieve values from parameters
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    ### START CODE HERE ###
    # compute the gradient of tanh with respect to a_next (≈1 line)
    dtanh = (1 - a_next ** 2) * da_next

    # compute the gradient of the loss with respect to Wax (≈2 lines)
    dxt = np.dot(Wax.T, dtanh)
    dWax = np.dot(dtanh, xt.T)

    # compute the gradient with respect to Waa (≈2 lines)
    da_prev = np.dot(Waa.T, dtanh)
    dWaa = np.dot(dtanh, a_prev.T)

    # compute the gradient with respect to b (≈1 line)
    dba = np.sum(dtanh, axis=1, keepdims=True)
    ### END CODE HERE ###

    # Store the gradients in a python dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients
gradients["dxt"][1][2] = -0.460564103059
gradients["dxt"].shape = (3, 10)
gradients["da_prev"][2][3] = 0.0842968653807
gradients["da_prev"].shape = (5, 10)
gradients["dWax"][3][1] = 0.393081873922
gradients["dWax"].shape = (5, 3)
gradients["dWaa"][1][2] = 0.253572776461
gradients["dWaa"].shape = (5, 5)
gradients["dba"][4] = [ 0.80517166]
gradients["dba"].shape = (5, 1)
def rnn_backward(da, caches):
    ### START CODE HERE ###
    # Retrieve values from the first cache (t=1) of caches (≈2 lines)
    (caches, x) = caches
    (a1, a0, x1, parameters) = caches[0]

    # Retrieve dimensions from da's and x1's shapes (≈2 lines)
    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # initialize the gradients with the right sizes (≈6 lines)
    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    da0 = np.zeros((n_a, m))
    da_prevt = np.zeros((n_a, m))

    # Loop through all the time steps
    for t in reversed(range(T_x)):
        # Compute gradients at time step t. Choose wisely the "da_next" and the "cache" to use in the backward propagation step. (≈1 line)
        gradients = rnn_cell_backward(da[:, :, t] + da_prevt, caches[t])
        # Retrieve derivatives from gradients (≈1 line)
        dxt, da_prevt, dWaxt, dWaat, dbat = gradients["dxt"], gradients["da_prev"], gradients["dWax"], gradients["dWaa"], gradients["dba"]
        # Increment global derivatives w.r.t parameters by adding their derivative at time-step t (≈4 lines)
        dx[:, :, t] = dxt
        dWax += dWaxt
        dWaa += dWaat
        dba += dbat

    # Set da0 to the gradient of a which has been backpropagated through all time-steps (≈1 line)
    da0 = da_prevt
    ### END CODE HERE ###

    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients
gradients["dx"][1][2] = [-2.07101689 -0.592556270.024668550.01483317]
gradients["dx"].shape = (3, 10, 4)
gradients["da0"][2][3] = -0.314942375127
gradients["da0"].shape = (5, 10)
gradients["dWax"][3][1] = 11.2641044965
gradients["dWax"].shape = (5, 3)
gradients["dWaa"][1][2] = 5.60884278841
gradients["dWaa"].shape = (5, 5)
gradients["dba"][4] = [-0.74747722]
gradients["dba"].shape = (5, 1)
3.2 - LSTM backward pass
3.2.3 - Parameter derivatives
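Equations (7)-(17) referenced in the comments below are not reproduced in this post; spelled out, the code implements the following, with $\odot$ for element-wise product and $[a^{\langle t-1 \rangle}, x^{\langle t \rangle}]$ the concatenation used in the forward pass:

$d\Gamma_o = da^{\langle t \rangle} \odot \tanh(c^{\langle t \rangle}) \odot \Gamma_o \odot (1-\Gamma_o)$
$d\tilde{c} = \big(dc^{\langle t \rangle} \odot \Gamma_i + \Gamma_o \odot (1-\tanh^2(c^{\langle t \rangle})) \odot \Gamma_i \odot da^{\langle t \rangle}\big) \odot (1-\tilde{c}^2)$
$d\Gamma_i = \big(dc^{\langle t \rangle} \odot \tilde{c} + \Gamma_o \odot (1-\tanh^2(c^{\langle t \rangle})) \odot \tilde{c} \odot da^{\langle t \rangle}\big) \odot \Gamma_i \odot (1-\Gamma_i)$
$d\Gamma_f = \big(dc^{\langle t \rangle} \odot c^{\langle t-1 \rangle} + \Gamma_o \odot (1-\tanh^2(c^{\langle t \rangle})) \odot c^{\langle t-1 \rangle} \odot da^{\langle t \rangle}\big) \odot \Gamma_f \odot (1-\Gamma_f)$
$dW_f = d\Gamma_f \, [a^{\langle t-1 \rangle}, x^{\langle t \rangle}]^T$ (analogously for $dW_i$, $dW_c$, $dW_o$), $\quad db_f = \sum_{\text{batch}} d\Gamma_f$ (analogously for the other biases)
$da^{\langle t-1 \rangle} = W_f[:, :n_a]^T d\Gamma_f + W_i[:, :n_a]^T d\Gamma_i + W_c[:, :n_a]^T d\tilde{c} + W_o[:, :n_a]^T d\Gamma_o$
$dc^{\langle t-1 \rangle} = dc^{\langle t \rangle} \odot \Gamma_f + \Gamma_o \odot (1-\tanh^2(c^{\langle t \rangle})) \odot \Gamma_f \odot da^{\langle t \rangle}$
$dx^{\langle t \rangle} = W_f[:, n_a:]^T d\Gamma_f + W_i[:, n_a:]^T d\Gamma_i + W_c[:, n_a:]^T d\tilde{c} + W_o[:, n_a:]^T d\Gamma_o$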
def lstm_cell_backward(da_next, dc_next, cache):
    # Retrieve information from "cache"
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache

    ### START CODE HERE ###
    # Retrieve dimensions from xt's and a_next's shape (≈2 lines)
    n_x, m = xt.shape
    n_a, m = a_next.shape

    # Compute gate-related derivatives using equations (7) to (10) (≈4 lines)
    dot = da_next * np.tanh(c_next) * ot * (1 - ot)
    dcct = (dc_next * it + ot * (1 - np.square(np.tanh(c_next))) * it * da_next) * (1 - np.square(cct))
    dit = (dc_next * cct + ot * (1 - np.square(np.tanh(c_next))) * cct * da_next) * it * (1 - it)
    dft = (dc_next * c_prev + ot * (1 - np.square(np.tanh(c_next))) * c_prev * da_next) * ft * (1 - ft)

    # Compute parameter-related derivatives. Use equations (11)-(14) (≈8 lines)
    dWf = np.dot(dft, np.concatenate((a_prev, xt), axis=0).T)
    dWi = np.dot(dit, np.concatenate((a_prev, xt), axis=0).T)
    dWc = np.dot(dcct, np.concatenate((a_prev, xt), axis=0).T)
    dWo = np.dot(dot, np.concatenate((a_prev, xt), axis=0).T)
    dbf = np.sum(dft, axis=1, keepdims=True)
    dbi = np.sum(dit, axis=1, keepdims=True)
    dbc = np.sum(dcct, axis=1, keepdims=True)
    dbo = np.sum(dot, axis=1, keepdims=True)

    # Compute derivatives w.r.t previous hidden state, previous memory state and input. Use equations (15)-(17). (≈3 lines)
    da_prev = np.dot(parameters['Wf'][:, :n_a].T, dft) + np.dot(parameters['Wi'][:, :n_a].T, dit) + np.dot(parameters['Wc'][:, :n_a].T, dcct) + np.dot(parameters['Wo'][:, :n_a].T, dot)
    dc_prev = dc_next * ft + ot * (1 - np.square(np.tanh(c_next))) * ft * da_next
    dxt = np.dot(parameters['Wf'][:, n_a:].T, dft) + np.dot(parameters['Wi'][:, n_a:].T, dit) + np.dot(parameters['Wc'][:, n_a:].T, dcct) + np.dot(parameters['Wo'][:, n_a:].T, dot)
    ### END CODE HERE ###

    # Save gradients in dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}

    return gradients
gradients["dxt"][1][2] = 3.23055911511
gradients["dxt"].shape = (3, 10)
gradients["da_prev"][2][3] = -0.0639621419711
gradients["da_prev"].shape = (5, 10)
gradients["dc_prev"][2][3] = 0.797522038797
gradients["dc_prev"].shape = (5, 10)
gradients["dWf"][3][1] = -0.147954838164
gradients["dWf"].shape = (5, 8)
gradients["dWi"][1][2] = 1.05749805523
gradients["dWi"].shape = (5, 8)
gradients["dWc"][3][1] = 2.30456216369
gradients["dWc"].shape = (5, 8)
gradients["dWo"][1][2] = 0.331311595289
gradients["dWo"].shape = (5, 8)
gradients["dbf"][4] = [ 0.18864637]
gradients["dbf"].shape = (5, 1)
gradients["dbi"][4] = [-0.40142491]
gradients["dbi"].shape = (5, 1)
gradients["dbc"][4] = [ 0.25587763]
gradients["dbc"].shape = (5, 1)
gradients["dbo"][4] = [ 0.13893342]
gradients["dbo"].shape = (5, 1)
3.3 - Backward pass through the LSTM RNN
def lstm_backward(da, caches):
    ### START CODE HERE ###
    # Retrieve values from the first cache (t=1) of caches (≈2 lines)
    # (the cache layout matches lstm_cell_forward's return value)
    (caches, x) = caches
    (a1, c1, a0, c0, f1, i1, cc1, o1, x1, parameters) = caches[0]

    # Retrieve dimensions from da's and x1's shapes (≈2 lines)
    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # initialize the gradients with the right sizes (≈12 lines)
    dx = np.zeros((n_x, m, T_x))
    da0 = np.zeros((n_a, m))
    da_prevt = np.zeros((n_a, m))
    dc_prevt = np.zeros((n_a, m))
    dWf = np.zeros((n_a, n_a + n_x))
    dWi = np.zeros((n_a, n_a + n_x))
    dWc = np.zeros((n_a, n_a + n_x))
    dWo = np.zeros((n_a, n_a + n_x))
    dbf = np.zeros((n_a, 1))
    dbi = np.zeros((n_a, 1))
    dbc = np.zeros((n_a, 1))
    dbo = np.zeros((n_a, 1))

    # loop back over the whole sequence
    for t in reversed(range(T_x)):
        # Compute all gradients using lstm_cell_backward
        gradients = lstm_cell_backward(da[:, :, t] + da_prevt, dc_prevt, caches[t])
        # Store or add the gradient to the parameters' previous step's gradient
        dx[:, :, t] = gradients['dxt']
        dWf = dWf + gradients['dWf']
        dWi = dWi + gradients['dWi']
        dWc = dWc + gradients['dWc']
        dWo = dWo + gradients['dWo']
        dbf = dbf + gradients['dbf']
        dbi = dbi + gradients['dbi']
        dbc = dbc + gradients['dbc']
        dbo = dbo + gradients['dbo']

    # Set the first activation's gradient to the backpropagated gradient da_prev.
    da0 = gradients['da_prev']
    ### END CODE HERE ###

    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}

    return gradients
gradients["dx"][1][2] = [-0.00057129 0.08287442 -0.30545663 -0.43281115]
gradients["dx"].shape = (3, 10, 4)
gradients["da0"][2][3] = -0.0979986136214
gradients["da0"].shape = (5, 10)
gradients["dWf"][3][1] = -0.155977272872
gradients["dWf"].shape = (5, 8)
gradients["dWi"][1][2] = 0.102371820249
gradients["dWi"].shape = (5, 8)
gradients["dWc"][3][1] = -0.0624983794927
gradients["dWc"].shape = (5, 8)
gradients["dWo"][1][2] = 0.0484389131444
gradients["dWo"].shape = (5, 8)
gradients["dbf"][4] = [ 0.00818495]
gradients["dbf"].shape = (5, 1)
gradients["dbi"][4] = [-0.15399065]
gradients["dbi"].shape = (5, 1)
gradients["dbc"][4] = [-0.29691142]
gradients["dbc"].shape = (5, 1)
gradients["dbo"][4] = [-0.29798344]
gradients["dbo"].shape = (5, 1)