deep | Andrew Ng, deeplearning.ai Course 2, Week 2: Optimization

1、code

# optimize_algorithm.py -- the unit-test script in section 2 imports this module
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets
import opt_utils   # see the course data package, or copy it from the end of this article
import testCase    # see the course data package, or copy it from the end of this article

# %matplotlib inline   # uncomment this line if you are using a Jupyter Notebook

plt.rcParams['figure.figsize'] = (7.0, 4.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'


def update_parameters_with_gd(parameters, grads, learning_rate):
    # plain (mini-batch) gradient descent step for every layer
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]
    return parameters


def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    np.random.seed(seed)
    m = Y.shape[1]
    mini_batches = []

    # step 1: shuffle the columns of X and Y with the same permutation
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]                      # NumPy fancy indexing on columns
    shuffled_Y = Y[:, permutation].reshape(1, m)

    # step 2: partition into complete mini-batches
    num_complete_minibatches = m // mini_batch_size
    for k in range(num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    # step 3: the last, smaller mini-batch (when m is not a multiple of mini_batch_size)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, mini_batch_size * num_complete_minibatches:]
        mini_batch_Y = shuffled_Y[:, mini_batch_size * num_complete_minibatches:]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches


def initialize_velocity(parameters):
    # one zero "velocity" array per parameter, with the same shape as the parameter
    L = len(parameters) // 2
    v = {}
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros_like(parameters["W" + str(l + 1)])
        v["db" + str(l + 1)] = np.zeros_like(parameters["b" + str(l + 1)])
    return v


def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        # exponentially weighted average of the gradients
        v["dW" + str(l + 1)] = beta * v["dW" + str(l + 1)] + (1 - beta) * grads["dW" + str(l + 1)]
        v["db" + str(l + 1)] = beta * v["db" + str(l + 1)] + (1 - beta) * grads["db" + str(l + 1)]
        # update the parameters with the velocity
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v["db" + str(l + 1)]
    return parameters, v


def initialize_adam(parameters):
    # v: first moment (momentum), s: second moment (RMSprop), both initialized to zero
    L = len(parameters) // 2
    v = {}
    s = {}
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros_like(parameters["W" + str(l + 1)])
        v["db" + str(l + 1)] = np.zeros_like(parameters["b" + str(l + 1)])
        s["dW" + str(l + 1)] = np.zeros_like(parameters["W" + str(l + 1)])
        s["db" + str(l + 1)] = np.zeros_like(parameters["b" + str(l + 1)])
    return v, s


def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01,
                                beta1=0.9, beta2=0.999, epsilon=1e-8):
    L = len(parameters) // 2
    v_corrected = {}
    s_corrected = {}
    for l in range(L):
        # first moment (momentum) and its bias correction
        v["dW" + str(l + 1)] = beta1 * v["dW" + str(l + 1)] + (1 - beta1) * grads["dW" + str(l + 1)]
        v["db" + str(l + 1)] = beta1 * v["db" + str(l + 1)] + (1 - beta1) * grads["db" + str(l + 1)]
        v_corrected["dW" + str(l + 1)] = v["dW" + str(l + 1)] / (1 - np.power(beta1, t))
        v_corrected["db" + str(l + 1)] = v["db" + str(l + 1)] / (1 - np.power(beta1, t))

        # second moment (RMSprop) and its bias correction
        s["dW" + str(l + 1)] = beta2 * s["dW" + str(l + 1)] + (1 - beta2) * np.power(grads["dW" + str(l + 1)], 2)
        s["db" + str(l + 1)] = beta2 * s["db" + str(l + 1)] + (1 - beta2) * np.power(grads["db" + str(l + 1)], 2)
        s_corrected["dW" + str(l + 1)] = s["dW" + str(l + 1)] / (1 - np.power(beta2, t))
        s_corrected["db" + str(l + 1)] = s["db" + str(l + 1)] / (1 - np.power(beta2, t))

        # combined update
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * (
            v_corrected["dW" + str(l + 1)] / np.sqrt(s_corrected["dW" + str(l + 1)] + epsilon))
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * (
            v_corrected["db" + str(l + 1)] / np.sqrt(s_corrected["db" + str(l + 1)] + epsilon))
    return parameters, v, s


def model(X, Y, layer_dims, optimizer, learning_rate=0.0007, mini_batch_size=64, beta=0.9,
          beta1=0.9, beta2=0.999, epsilon=1e-8, num_epochs=10000, print_cost=True, is_plot=True):
    costs = []
    t = 0        # counts Adam updates (needed for bias correction)
    seed = 10    # incremented every epoch so the mini-batches are reshuffled differently

    parameters = opt_utils.initialize_parameters(layer_dims)

    # initialize the optimizer state
    if optimizer == "gd":
        pass
    elif optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)
    else:
        print("Error: unknown optimizer " + str(optimizer))
        exit()

    for i in range(num_epochs):
        seed = seed + 1
        mini_batches = random_mini_batches(X, Y, mini_batch_size, seed)

        for mini_batch in mini_batches:
            (mini_batch_X, mini_batch_Y) = mini_batch
            AL, cache = opt_utils.forward_propagation(mini_batch_X, parameters)
            cost = opt_utils.compute_cost(AL, mini_batch_Y)
            grads = opt_utils.backward_propagation(mini_batch_X, mini_batch_Y, cache)

            if optimizer == "gd":
                parameters = update_parameters_with_gd(parameters, grads, learning_rate)
            elif optimizer == "momentum":
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == "adam":
                t = t + 1
                parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t,
                                                               learning_rate, beta1, beta2, epsilon)

        if i % 100 == 0:
            costs.append(cost)
            if print_cost and i % 1000 == 0:
                print("iteration %s, cost = %s" % (str(i), str(cost)))

    if is_plot:
        plt.plot(costs)
        plt.xlabel("epochs (per 100)")
        plt.ylabel("cost")
        plt.title("learning_rate = " + str(learning_rate))
        plt.show()

    return parameters
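For reference, these are the standard update rules that the functions above implement (α is the learning rate, l indexes the layers, t counts the number of Adam updates taken):

Mini-batch gradient descent:
$$W^{[l]} := W^{[l]} - \alpha\, dW^{[l]}, \qquad b^{[l]} := b^{[l]} - \alpha\, db^{[l]}$$

Gradient descent with momentum:
$$v_{dW^{[l]}} := \beta\, v_{dW^{[l]}} + (1-\beta)\, dW^{[l]}, \qquad W^{[l]} := W^{[l]} - \alpha\, v_{dW^{[l]}}$$

Adam (momentum + RMSprop, with bias correction):
$$v := \beta_1 v + (1-\beta_1)\, dW, \qquad v^{\text{corrected}} = \frac{v}{1-\beta_1^{t}}$$
$$s := \beta_2 s + (1-\beta_2)\, (dW)^{2}, \qquad s^{\text{corrected}} = \frac{s}{1-\beta_2^{t}}$$
$$W := W - \alpha\, \frac{v^{\text{corrected}}}{\sqrt{s^{\text{corrected}}} + \varepsilon}$$

Note that update_parameters_with_adam above folds ε into the square root (it divides by sqrt(s_corrected + ε)); with ε = 1e-8 the two variants behave almost identically.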

2、unit_test
# unit tests for the functions in optimize_algorithm.py
from optimize_algorithm import *
from testCase import *


def line(s):
    print("=" * 10 + s + "=" * 10)


# Everything below is wrapped in a triple-quoted string, i.e. commented out;
# remove the surrounding quotes to run the tests and the training demo.
"""
line("test for update_parameters_with_gd")
parameters, grads, learning_rate = update_parameters_with_gd_test_case()
parameters = update_parameters_with_gd(parameters, grads, learning_rate)
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))

line("test for random_mini_batches")
X_assess, Y_assess, mini_batch_size = random_mini_batches_test_case()
mini_batches = random_mini_batches(X_assess, Y_assess, mini_batch_size)
print("shape of the 1st mini_batch_X:", mini_batches[0][0].shape)
print("shape of the 1st mini_batch_Y:", mini_batches[0][1].shape)
print("shape of the 2nd mini_batch_X:", mini_batches[1][0].shape)
print("shape of the 2nd mini_batch_Y:", mini_batches[1][1].shape)
print("shape of the 3rd mini_batch_X:", mini_batches[2][0].shape)
print("shape of the 3rd mini_batch_Y:", mini_batches[2][1].shape)

line("test for initialize_velocity")
parameters = initialize_velocity_test_case()
v = initialize_velocity(parameters)
print('v["dW1"] = ' + str(v["dW1"]))
print('v["db1"] = ' + str(v["db1"]))
print('v["dW2"] = ' + str(v["dW2"]))
print('v["db2"] = ' + str(v["db2"]))

line("test for update_parameters_with_momentum")
parameters, grads, v = update_parameters_with_momentum_test_case()
parameters, v = update_parameters_with_momentum(parameters, grads, v, beta=0.9, learning_rate=0.01)
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))
print('v["dW1"] = ' + str(v["dW1"]))
print('v["db1"] = ' + str(v["db1"]))
print('v["dW2"] = ' + str(v["dW2"]))
print('v["db2"] = ' + str(v["db2"]))
print('g["dW1"] = ' + str(grads["dW1"]))
print('g["db1"] = ' + str(grads["db1"]))
print('g["dW2"] = ' + str(grads["dW2"]))
print('g["db2"] = ' + str(grads["db2"]))

line("test for initialize_adam")
parameters = initialize_adam_test_case()
v, s = initialize_adam(parameters)
print('v["dW1"] = ' + str(v["dW1"]))
print('v["db1"] = ' + str(v["db1"]))
print('v["dW2"] = ' + str(v["dW2"]))
print('v["db2"] = ' + str(v["db2"]))
print('s["dW1"] = ' + str(s["dW1"]))
print('s["db1"] = ' + str(s["db1"]))
print('s["dW2"] = ' + str(s["dW2"]))
print('s["db2"] = ' + str(s["db2"]))

line("test for update_parameters_with_adam")
parameters, grads, v, s = update_parameters_with_adam_test_case()
parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t=2)
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))
print('v["dW1"] = ' + str(v["dW1"]))
print('v["db1"] = ' + str(v["db1"]))
print('v["dW2"] = ' + str(v["dW2"]))
print('v["db2"] = ' + str(v["db2"]))
print('s["dW1"] = ' + str(s["dW1"]))
print('s["db1"] = ' + str(s["db1"]))
print('s["dW2"] = ' + str(s["dW2"]))
print('s["db2"] = ' + str(s["db2"]))

# train on the course dataset and plot the decision boundary
train_X, train_Y = opt_utils.load_dataset(is_plot=True)
layer_dims = [train_X.shape[0], 5, 2, 1]
parameters = model(train_X, train_Y, layer_dims, optimizer="adam", is_plot=True)
prediction = opt_utils.predict(train_X, train_Y, parameters)

plt.title("Model with Adam optimization")
axes = plt.gca()
axes.set_xlim([-1.5, 2.5])
axes.set_ylim([-1, 1.5])
opt_utils.plot_decision_boundary(lambda x: opt_utils.predict_dec(parameters, x.T), train_X, train_Y)
"""
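Since every call in the script above sits inside the triple-quoted string, running the file as-is does nothing. Below is a minimal, illustrative sketch (not part of the original assignment code) of how the three optimizers could be compared on the course dataset; it assumes the course-provided opt_utils helpers (load_dataset, predict) and that the code from section 1 is saved as optimize_algorithm.py, as the import above suggests:

# compare_optimizers.py -- illustrative sketch, file name is hypothetical
import opt_utils                          # helpers shipped with the course assignment
from optimize_algorithm import model      # the model() defined in section 1

train_X, train_Y = opt_utils.load_dataset(is_plot=False)   # course dataset
layer_dims = [train_X.shape[0], 5, 2, 1]

for opt in ("gd", "momentum", "adam"):
    print("training with optimizer =", opt)
    # model() plots its own cost curve when is_plot=True
    parameters = model(train_X, train_Y, layer_dims, optimizer=opt,
                       num_epochs=10000, print_cost=True, is_plot=True)
    # training accuracy, computed by the course helper
    prediction = opt_utils.predict(train_X, train_Y, parameters)

In the course assignment, Adam typically converges faster and reaches a noticeably higher training accuracy on this dataset than plain mini-batch gradient descent or momentum; this sketch simply reproduces that comparison.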
