Python | Recognizing MNIST Handwritten Digits with Backpropagation (mini-batch version)

The network is trained with backpropagation (BP), which computes gradients efficiently.
Mini-batches are used so that a whole batch of samples is processed at once, which speeds up training.
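For reference (this formula is not spelled out in the original note), the update applied to every parameter on each mini-batch in the script below is plain stochastic gradient descent with learning rate $\eta = 0.1$, where $L$ is the cross-entropy loss averaged over the batch:

$$
W \leftarrow W - \eta\,\frac{\partial L}{\partial W}, \qquad
b \leftarrow b - \eta\,\frac{\partial L}{\partial b}
$$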

```python
# BP_Study.py
# Training with backpropagation: MNIST handwritten-digit classification
# 2-layer network
import numpy as np
import time
from dataset.mnist import load_mnist
from TwoLayerNet import TwoLayerNet
import matplotlib.pyplot as plt

start = time.perf_counter()   # time.clock() was removed in Python 3.8

# load the data
(x_train, t_train), (x_test, t_test) = \
    load_mnist(normalize=True, one_hot_label=True)

net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iter_num = 10000
learning_rate = 0.1
train_size = x_train.shape[0]
batch_size = 100
train_loss_list = []
train_acc_list = []
test_acc_list = []

# One epoch is max(1, train_size / batch_size) iterations, here 600.
# Each iteration draws a random batch, so one epoch roughly sweeps the training set once.
# With 10000 iterations that is 10000 / 600 ≈ 16.7 epochs,
# i.e. the training set is passed over about 16-17 times.
iter_per_epoch = max(1, train_size / batch_size)

for i in range(iter_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # gradients via backpropagation
    grad = net.gradient(x_batch, t_batch)

    # update the parameters
    for key in ('w1', 'b1', 'w2', 'b2'):
        net.params[key] -= learning_rate * grad[key]

    loss = net.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    # print('loss:' + str(loss))

    if i % iter_per_epoch == 0:
        # accuracy is evaluated once per epoch (at i = 0, 600, ..., 9600 -> 17 times in total)
        train_acc = net.accuracy(x_train, t_train)
        test_acc = net.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print('train_acc,test_acc |' + str(train_acc) + ',' + str(test_acc))

# plot the loss curve
x1 = np.arange(len(train_loss_list))
ax1 = plt.subplot(211)
plt.plot(x1, train_loss_list)
plt.xlabel("iteration")
plt.ylabel("loss")

# plot training and test accuracy per epoch
markers = {'train': 'o', 'test': 's'}
x2 = np.arange(len(train_acc_list))
ax2 = plt.subplot(212)
plt.plot(x2, train_acc_list, label='train acc')
plt.plot(x2, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()

end = time.perf_counter()
print('Running Time: %s Seconds' % (end - start))
```
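As a quick sanity check on the epoch bookkeeping (not part of the original script, just the arithmetic behind the comments above, assuming the standard 60,000-image MNIST training set):

```python
train_size = 60000   # MNIST training set size
batch_size = 100
iter_num = 10000

iter_per_epoch = max(1, train_size / batch_size)   # 600 iterations per epoch
epochs = iter_num / iter_per_epoch                 # 10000 / 600 ≈ 16.7 epochs
evaluations = iter_num // iter_per_epoch + 1       # accuracy printed at i = 0, 600, ..., 9600
print(iter_per_epoch, epochs, evaluations)         # 600.0 16.666... 17.0
```

Which matches the 17 accuracy lines printed below.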

```
train_acc,test_acc |0.08986666666666666,0.0939
train_acc,test_acc |0.8999166666666667,0.9023
train_acc,test_acc |0.9235166666666667,0.9266
train_acc,test_acc |0.9342833333333334,0.9339
train_acc,test_acc |0.9443333333333334,0.9433
train_acc,test_acc |0.94995,0.9489
train_acc,test_acc |0.9556,0.9524
train_acc,test_acc |0.95845,0.9534
train_acc,test_acc |0.9624666666666667,0.9559
train_acc,test_acc |0.9648166666666667,0.959
train_acc,test_acc |0.96855,0.9621
train_acc,test_acc |0.96985,0.9632
train_acc,test_acc |0.9728333333333333,0.9655
train_acc,test_acc |0.9748166666666667,0.9679
train_acc,test_acc |0.9748166666666667,0.9657
train_acc,test_acc |0.9781166666666666,0.9691
train_acc,test_acc |0.9785,0.9687
Running Time: 91.28648850219648 Seconds
```

[Figure: training loss per iteration (top) and train/test accuracy per epoch (bottom)]

As the plots show, the loss drops quickly while both training accuracy and test accuracy keep climbing, which indicates that the network is learning effectively.
```python
# BackPropagation.py
# Classes for the ReLU, Sigmoid, Affine and SoftmaxWithLoss layers
import numpy as np


class Relu:
    def __init__(self):
        self.mask = None

    # forward pass
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()       # out starts as a copy of x
        out[self.mask] = 0
        return out

    # backward pass
    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx


class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1 / (1 + np.exp(-x))   # parentheses required: 1 / 1 + np.exp(-x) is wrong
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * self.out * (1 - self.out)
        return dx


class Affine:
    def __init__(self, w, b):
        self.w = w
        self.b = b
        self.x = None
        self.dw = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(self.x, self.w) + self.b
        # The expression above relies on NumPy broadcasting:
        # for Affine1, np.dot(self.x, self.w) has shape (100, 50)
        # while self.b has shape (50,), so b is added to every row.
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.w.T)
        # The weights pass through a multiply node: the gradient w.r.t. x is dout times w,
        # and the gradient w.r.t. w is x times dout.
        # The bias passes through an add node, so its gradient is just the column sum of dout.
        self.dw = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx


'''
# This softmax only supports 1-D input -- it cost me two hours of debugging
def softmax(a):
    c = np.max(a)
    exp_a = np.exp(a - c)   # avoid overflow
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y
'''


# softmax that also accepts 2-D input, i.e. a mini-batch
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)   # overflow countermeasure
    return np.exp(x) / np.sum(np.exp(x))


# cross-entropy error that supports mini-batch input
def cross_entropy_error(y, t):
    if y.ndim == 1:
        # y is a 1-D array, i.e. a single sample rather than a mini-batch,
        # so reshape t and y into row vectors
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # if the labels are one-hot vectors, convert them to class indices
    if t.size == y.size:
        t = t.argmax(axis=1)   # index of the max in each row; t goes from (100, 10) to (100,)

    batch_size = y.shape[0]
    temp = -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
    # temp is the average cross-entropy over the batch_size samples
    return temp


def mean_squared_error(y, t):
    return 0.5 * np.sum((y - t) ** 2)


class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None   # one-hot vector

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dy = (self.y - self.t) / batch_size
        return dy

# Every layer's backward() takes only the gradient arriving from the layer behind it
# and passes its own gradient on toward the front.
# The gradient at the very end of the chain is 1, so SoftmaxWithLoss gets dout=1.
```
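To illustrate the layer interface described above (each layer's forward() feeds the next one, and backward() propagates dout in reverse, starting from 1), here is a minimal sketch that pushes a tiny random "mini-batch" through Affine → ReLU → SoftmaxWithLoss by hand. The shapes and data are made up for illustration only:

```python
import numpy as np
from BackPropagation import Affine, Relu, SoftmaxWithLoss

np.random.seed(0)
x = np.random.randn(5, 4)                        # 5 samples, 4 features
t = np.eye(3)[np.random.randint(0, 3, size=5)]   # 5 one-hot labels over 3 classes

affine = Affine(0.01 * np.random.randn(4, 3), np.zeros(3))
relu = Relu()
last = SoftmaxWithLoss()

# forward: Affine -> ReLU -> SoftmaxWithLoss
loss = last.forward(relu.forward(affine.forward(x)), t)

# backward: start from dout = 1 and go through the layers in reverse order
dout = last.backward(1)
dout = relu.backward(dout)
dout = affine.backward(dout)

print(loss)              # scalar cross-entropy loss for the batch
print(affine.dw.shape)   # (4, 3), same shape as the weight matrix
print(affine.db.shape)   # (3,)
```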

Because mini-batches are used during backpropagation, the input is no longer a 1-D vector: every function in the pipeline has to handle matrix (batch) input, where several samples arrive at once. Otherwise the computation goes wrong and the root cause is very hard to track down.
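A quick way to convince yourself that the batch-aware softmax behaves correctly is to compare it, row by row, against a plain single-sample softmax. A minimal sketch (not in the original post), reusing the softmax defined in BackPropagation.py above:

```python
import numpy as np
from BackPropagation import softmax

def softmax_1d(a):
    # the simple single-sample version, like the commented-out function above
    exp_a = np.exp(a - np.max(a))
    return exp_a / np.sum(exp_a)

batch = np.random.randn(100, 10)   # e.g. scores for a mini-batch of 100 samples
row_by_row = np.array([softmax_1d(row) for row in batch])

print(np.allclose(softmax(batch), row_by_row))   # True: batch version matches per-row softmax
print(softmax(batch).sum(axis=1)[:3])            # each row sums to 1
```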
```python
# TwoLayerNet.py
# 2-layer network with one hidden layer
# gradients computed by backpropagation
import numpy as np
from collections import OrderedDict   # the layers must be stored in order so forward/backward visit them in sequence
from BackPropagation import *         # Affine, Relu, SoftmaxWithLoss layer classes


# Numerical gradient; the class method of the same name calls this function
'''
# This version only works when x is a 1-D vector.
# It does not handle multi-dimensional input such as a weight matrix, so it is rarely useful in a NN.
def numerical_gradient(f, x):
    h = 1e-3
    grad = np.zeros_like(x)   # array of zeros with the same shape as x

    for idx in range(x.size):
        tmp_val = x[idx]
        # compute f(x + h)
        # the gradient is the vector of all partial derivatives; to take the partial derivative
        # w.r.t. one variable, only that variable gets the small change h, hence the for loop
        x[idx] = tmp_val + h
        fxh1 = f(x)

        # compute f(x - h)
        x[idx] = tmp_val - h
        fxh2 = f(x)

        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val   # restore the value
    return grad
'''


# Numerical gradient that accepts matrix input,
# using NumPy's nditer object for multi-dimensional indexing
def numerical_gradient(f, x):
    h = 1e-4   # 0.0001
    grad = np.zeros_like(x)

    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)   # f(x + h)

        x[idx] = tmp_val - h
        fxh2 = f(x)   # f(x - h)
        grad[idx] = (fxh1 - fxh2) / (2 * h)

        x[idx] = tmp_val   # restore the value
        it.iternext()

    return grad


class TwoLayerNet:

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {}
        self.params['w1'] = weight_init_std * \
            np.random.randn(input_size, hidden_size)   # weights initialized from a Gaussian
        self.params['b1'] = np.zeros(hidden_size)       # biases initialized to 0
        self.params['w2'] = weight_init_std * \
            np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # Build the layers. Implementing the network from layer modules is very convenient:
        # layers can be stacked like Lego bricks to build a network of any depth.
        self.layers = OrderedDict()   # ordered dict
        self.layers['Affine1'] = \
            Affine(self.params['w1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = \
            Affine(self.params['w2'], self.params['b2'])
        self.lastlayer = SoftmaxWithLoss()

    def predict(self, x):
        for layer in self.layers.values():
            # the ordered dict holds three layers: Affine1, Relu1, Affine2
            x = layer.forward(x)
        # x is the output of Affine2, before softmax and the loss
        return x

    def loss(self, x, t):
        y = self.predict(x)   # output of Affine2, before softmax and the loss
        return self.lastlayer.forward(y, t)

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def gradient(self, x, t):
        # analytic gradients via backpropagation
        # forward
        self.loss(x, t)
        # the loss has been computed, so the forward pass is done; now run backward

        # backward
        dout = 1
        dout = self.lastlayer.backward(dout)   # first back through SoftmaxWithLoss

        layers = list(self.layers.values())
        layers.reverse()   # reverse the list
        for layer in layers:
            # back through Affine2, Relu1, Affine1 in turn
            dout = layer.backward(dout)

        grads = {}
        grads['w1'] = self.layers['Affine1'].dw
        grads['b1'] = self.layers['Affine1'].db
        grads['w2'] = self.layers['Affine2'].dw
        grads['b2'] = self.layers['Affine2'].db
        return grads

    def numerical_gradient(self, x, t):
        loss_w = lambda w: self.loss(x, t)

        grads = {}
        grads['w1'] = numerical_gradient(loss_w, self.params['w1'])
        grads['b1'] = numerical_gradient(loss_w, self.params['b1'])
        grads['w2'] = numerical_gradient(loss_w, self.params['w2'])
        grads['b2'] = numerical_gradient(loss_w, self.params['b2'])
        return grads
```
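Since TwoLayerNet exposes both gradient() (backpropagation) and numerical_gradient(), a natural sanity check is to compare the two on a few samples: if the backprop implementation is correct, the difference should be very close to zero. A minimal sketch (not part of the original post), assuming the same dataset.mnist loader used above:

```python
# gradient_check.py -- compare backprop gradients with numerical gradients
import numpy as np
from dataset.mnist import load_mnist
from TwoLayerNet import TwoLayerNet

(x_train, t_train), _ = load_mnist(normalize=True, one_hot_label=True)

net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# use only a few samples: the numerical gradient is very slow
x_batch = x_train[:3]
t_batch = t_train[:3]

grad_numerical = net.numerical_gradient(x_batch, t_batch)
grad_backprop = net.gradient(x_batch, t_batch)

for key in ('w1', 'b1', 'w2', 'b2'):
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ':' + str(diff))   # should be tiny, close to zero
```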
