公益AI TASK06 - Batch Normalization and Residual Networks; Convex Optimization; Gradient Descent

Learning objectives
I. Batch normalization and residual networks
II. Convex optimization
III. Gradient descent

I. Batch Normalization and Residual Networks
My understanding: batch normalization takes a messy batch of values and rescales and shifts them so that they have mean 0 and standard deviation 1; "batch" refers to the fact that data is fed to the network one mini-batch at a time. A residual network adds skip connections that carry values across layers, which apparently is what lets such networks be made either shallow or very deep.
Standardizing the input (shallow models)
After standardization, every feature has mean 0 and standard deviation 1 over all samples in the dataset.
Standardizing the input makes the distributions of the different features similar.
Batch normalization (deep models)
Batch normalization uses the mean and standard deviation computed on each mini-batch to keep adjusting the intermediate outputs of the network, so that the intermediate outputs at every layer stay numerically more stable.
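A minimal sketch of the core normalization step (the tensor X is random fake data, and its shape is made up purely for illustration):

import torch

X = torch.randn(64, 3) * 5.0 + 2.0                 # fake mini-batch: 64 samples, 3 features
mean = X.mean(dim=0, keepdim=True)                  # per-feature mean over the mini-batch
var = X.var(dim=0, unbiased=False, keepdim=True)    # per-feature variance over the mini-batch
X_hat = (X - mean) / torch.sqrt(var + 1e-5)         # normalized: roughly mean 0, std 1
print(X_hat.mean(dim=0), X_hat.std(dim=0))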
1. Batch normalization for a fully connected layer
Position: between the affine transformation and the activation function inside the fully connected layer.
Fully connected layer:
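A sketch of the standard batch normalization computation for a fully connected layer (the usual textbook formulation; $\boldsymbol{u}$ is the layer input, $\boldsymbol{W}$, $\boldsymbol{b}$ the affine parameters, $\phi$ the activation):

$\boldsymbol{x} = \boldsymbol{W}\boldsymbol{u} + \boldsymbol{b}, \qquad \text{output} = \phi(\text{BN}(\boldsymbol{x}))$

For a mini-batch $\mathcal{B} = \{\boldsymbol{x}^{(1)}, \dots, \boldsymbol{x}^{(m)}\}$:

$\boldsymbol{\mu}_\mathcal{B} = \frac{1}{m}\sum_{i=1}^{m}\boldsymbol{x}^{(i)}, \qquad \boldsymbol{\sigma}_\mathcal{B}^2 = \frac{1}{m}\sum_{i=1}^{m}\bigl(\boldsymbol{x}^{(i)} - \boldsymbol{\mu}_\mathcal{B}\bigr)^2$

$\hat{\boldsymbol{x}}^{(i)} = \frac{\boldsymbol{x}^{(i)} - \boldsymbol{\mu}_\mathcal{B}}{\sqrt{\boldsymbol{\sigma}_\mathcal{B}^2 + \epsilon}}, \qquad \boldsymbol{y}^{(i)} = \boldsymbol{\gamma} \odot \hat{\boldsymbol{x}}^{(i)} + \boldsymbol{\beta}$

where $\boldsymbol{\gamma}$ (scale) and $\boldsymbol{\beta}$ (shift) are learnable parameters and $\epsilon > 0$ avoids division by zero.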

2. Batch normalization for a convolutional layer
Position: after the convolution and before the activation function.
If the convolution has multiple output channels, batch normalization is applied to each output channel separately, and every channel gets its own scale and shift parameters. Computation: for a single channel, with batch size m and a convolution output of size p×q, batch normalization is applied jointly to the m×p×q elements of that channel, using the same mean and variance for all of them.
3. Batch normalization at prediction time
Training: compute the mean and variance on each batch, one batch at a time.
Prediction: use moving averages to estimate the mean and variance of the entire training set.
[Code]

# From-scratch implementation
import time
import torch
from torch import nn, optim
import torch.nn.functional as F
import torchvision
import sys
sys.path.append("/home/kesci/input/")
import d2lzh1981 as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # Decide whether we are in training mode or prediction mode
    if not is_training:
        # In prediction mode, directly use the moving-average mean and variance passed in
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Fully connected case: compute mean and variance over the feature dimension
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 2D convolution case: compute mean and variance per channel (axis=1).
            # Keep X's shape so broadcasting works later
            mean = X.mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
            var = ((X - mean) ** 2).mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
        # In training mode, standardize with the current mini-batch mean and variance
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the moving-average mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var

class BatchNorm(nn.Module):
    def __init__(self, num_features, num_dims):
        super(BatchNorm, self).__init__()
        if num_dims == 2:
            shape = (1, num_features)        # number of outputs of a fully connected layer
        else:
            shape = (1, num_features, 1, 1)  # number of channels
        # Scale and shift parameters updated by gradients, initialized to 1 and 0 respectively
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Variables not updated by gradients, initialized to 0 on the CPU
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.zeros(shape)

    def forward(self, X):
        # If X is not on the CPU, copy moving_mean and moving_var to the device where X lives
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Save the updated moving_mean and moving_var; a Module's training attribute is
        # True by default and becomes False after calling .eval()
        Y, self.moving_mean, self.moving_var = batch_norm(self.training, X, self.gamma, self.beta,
                                                          self.moving_mean, self.moving_var,
                                                          eps=1e-5, momentum=0.9)
        return Y

# Application based on LeNet
net = nn.Sequential(
    nn.Conv2d(1, 6, 5),  # in_channels, out_channels, kernel_size
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),  # kernel_size, stride
    nn.Conv2d(6, 16, 5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    d2l.FlattenLayer(),
    nn.Linear(16*4*4, 120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10)
)
print(net)

# batch_size = 256  # reduce batch_size when running on CPU
batch_size = 16

def load_data_fashion_mnist(batch_size, resize=None, root='/home/kesci/input/FashionMNIST2065'):
    """Download the fashion mnist dataset and then load into memory."""
    trans = []
    if resize:
        trans.append(torchvision.transforms.Resize(size=resize))
    trans.append(torchvision.transforms.ToTensor())
    transform = torchvision.transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, download=True, transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, download=True, transform=transform)
    train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=2)
    test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=2)
    return train_iter, test_iter

train_iter, test_iter = load_data_fashion_mnist(batch_size)

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

# Concise implementation (use the BatchNorm layers from the nn package directly)
net = nn.Sequential(
    nn.Conv2d(1, 6, 5),  # in_channels, out_channels, kernel_size
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),  # kernel_size, stride
    nn.Conv2d(6, 16, 5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    d2l.FlattenLayer(),
    nn.Linear(16*4*4, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10)
)

optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

Residual networks (ResNet)
The problem in deep learning: once a deep CNN reaches a certain depth, blindly stacking more layers does not improve classification performance any further; instead, the network converges more slowly and the accuracy can even become worse.
Residual block (Residual Block)
Identity mapping:
Left: the block has to fit f(x) = x directly.
Right: the block only has to fit the residual f(x) - x = 0, which makes it easier to capture small fluctuations around the identity mapping.

In a residual block, the input can propagate forward more quickly through the cross-layer data path.
class Residual(nn.Module):
    # This class is saved in the d2lzh_pytorch package for later use.
    # You can set the number of output channels, whether to use an extra 1x1 convolution
    # to change the channel count, and the stride of the convolution layers.
    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return F.relu(Y + X)

blk = Residual(3, 3)
X = torch.rand((4, 3, 6, 6))
blk(X).shape  # torch.Size([4, 3, 6, 6])

blk = Residual(3, 6, use_1x1conv=True, stride=2)
blk(X).shape  # torch.Size([4, 6, 3, 3])

# ResNet model:
# convolution (64 channels, 7x7 kernel, stride 2, padding 3)
# batch normalization
# max pooling (3x3, stride 2)
# 4 residual stages (a stride-2 residual block between stages halves height and width)
# global average pooling
# fully connected layer
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    if first_block:
        assert in_channels == out_channels  # the first stage keeps the input channel count
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(in_channels, out_channels, use_1x1conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return nn.Sequential(*blk)

net.add_module("resnet_block1", resnet_block(64, 64, 2, first_block=True))
net.add_module("resnet_block2", resnet_block(64, 128, 2))
net.add_module("resnet_block3", resnet_block(128, 256, 2))
net.add_module("resnet_block4", resnet_block(256, 512, 2))
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d())  # GlobalAvgPool2d output: (Batch, 512, 1, 1)
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10)))

X = torch.rand((1, 1, 224, 224))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

Densely connected networks (DenseNet)
Main building blocks:
Dense block: defines how inputs and outputs are concatenated.
Transition layer: controls the number of channels so that it does not grow too large.
Dense block code
def conv_block(in_channels, out_channels):
    blk = nn.Sequential(nn.BatchNorm2d(in_channels),
                        nn.ReLU(),
                        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
    return blk

class DenseBlock(nn.Module):
    def __init__(self, num_convs, in_channels, out_channels):
        super(DenseBlock, self).__init__()
        net = []
        for i in range(num_convs):
            in_c = in_channels + i * out_channels
            net.append(conv_block(in_c, out_channels))
        self.net = nn.ModuleList(net)
        self.out_channels = in_channels + num_convs * out_channels  # number of output channels

    def forward(self, X):
        for blk in self.net:
            Y = blk(X)
            X = torch.cat((X, Y), dim=1)  # concatenate input and output along the channel dimension
        return X

blk = DenseBlock(2, 3, 10)
X = torch.rand(4, 3, 8, 8)
Y = blk(X)
Y.shape  # torch.Size([4, 23, 8, 8])

# Transition layer
# 1x1 convolution: reduces the number of channels
# stride-2 average pooling: halves height and width
def transition_block(in_channels, out_channels):
    blk = nn.Sequential(
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2))
    return blk

blk = transition_block(23, 10)
blk(Y).shape  # torch.Size([4, 10, 4, 4])

# DenseNet model
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

num_channels, growth_rate = 64, 32  # num_channels: the current number of channels
num_convs_in_dense_blocks = [4, 4, 4, 4]

for i, num_convs in enumerate(num_convs_in_dense_blocks):
    DB = DenseBlock(num_convs, num_channels, growth_rate)
    net.add_module("DenseBlock_%d" % i, DB)
    # number of output channels of the previous dense block
    num_channels = DB.out_channels
    # add a transition layer between dense blocks to halve the channel count
    if i != len(num_convs_in_dense_blocks) - 1:
        net.add_module("transition_block_%d" % i, transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2

net.add_module("BN", nn.BatchNorm2d(num_channels))
net.add_module("relu", nn.ReLU())
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d())  # GlobalAvgPool2d output: (Batch, num_channels, 1, 1)
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(num_channels, 10)))

X = torch.rand((1, 1, 96, 96))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

# batch_size = 256
batch_size = 16  # if an "out of memory" error occurs, reduce batch_size or the resize value
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=96)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

II. Convex Optimization
Optimization and deep learning
Optimization vs. estimation
Although optimization methods can minimize the value of the loss function in deep learning, the goal that optimization pursues is, in essence, not the same as the goal of deep learning.
Goal of an optimization method: minimize the loss on the training set.
Goal of deep learning: minimize the loss on the test set (i.e., generalization).
%matplotlib inline
import sys
sys.path.append('/home/kesci/input')
import d2lzh1981 as d2l
from mpl_toolkits import mplot3d  # for 3D plotting
import numpy as np

def f(x):
    return x * np.cos(np.pi * x)

def g(x):
    return f(x) + 0.2 * np.cos(5 * np.pi * x)

d2l.set_figsize((5, 3))
x = np.arange(0.5, 1.5, 0.01)
fig_f, = d2l.plt.plot(x, f(x), label="train error")
fig_g, = d2l.plt.plot(x, g(x), '--', c='purple', label="test error")
fig_f.axes.annotate('empirical risk', (1.0, -1.2), (0.5, -1.1), arrowprops=dict(arrowstyle='->'))
fig_g.axes.annotate('expected risk', (1.1, -1.05), (0.95, -0.5), arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('risk')
d2l.plt.legend(loc="upper right")

Challenges of optimization in deep learning
① Local minima
② Saddle points
③ Vanishing gradients
Local minima
def f(x):
    return x * np.cos(np.pi * x)

d2l.set_figsize((4.5, 2.5))
x = np.arange(-1.0, 2.0, 0.1)
fig, = d2l.plt.plot(x, f(x))
fig.axes.annotate('local minimum', xy=(-0.3, -0.25), xytext=(-0.77, -1.0),
                  arrowprops=dict(arrowstyle='->'))
fig.axes.annotate('global minimum', xy=(1.1, -0.95), xytext=(0.6, 0.8),
                  arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)');

Saddle points
x = np.arange(-2.0, 2.0, 0.1)
fig, = d2l.plt.plot(x, x**3)
fig.axes.annotate('saddle point', xy=(0, -0.2), xytext=(-0.52, -5.0),
                  arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)');

x, y = np.mgrid[-1: 1: 31j, -1: 1: 31j]
z = x**2 - y**2

d2l.set_figsize((6, 4))
ax = d2l.plt.figure().add_subplot(111, projection='3d')
ax.plot_wireframe(x, y, z, **{'rstride': 2, 'cstride': 2})
ax.plot([0], [0], [0], 'ro', markersize=10)
ticks = [-1, 0, 1]
d2l.plt.xticks(ticks)
d2l.plt.yticks(ticks)
ax.set_zticks(ticks)
d2l.plt.xlabel('x')
d2l.plt.ylabel('y');

Vanishing gradients
x = np.arange(-2.0, 5.0, 0.01)
fig, = d2l.plt.plot(x, np.tanh(x))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)')
fig.axes.annotate('vanishing gradient', (4, 1), (2, 0.0), arrowprops=dict(arrowstyle='->'))

Convexity
Basics
Sets
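For reference, the standard definition: a set $\mathcal{C}$ is convex if the line segment between any two of its points stays inside it, i.e.

$\lambda \boldsymbol{x} + (1 - \lambda)\boldsymbol{y} \in \mathcal{C} \quad \text{for all } \boldsymbol{x}, \boldsymbol{y} \in \mathcal{C},\ \lambda \in [0, 1].$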

Functions
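For reference, the standard definition: a function $f$ defined on a convex set $\mathcal{C}$ is convex if

$f\bigl(\lambda \boldsymbol{x} + (1-\lambda)\boldsymbol{y}\bigr) \le \lambda f(\boldsymbol{x}) + (1-\lambda) f(\boldsymbol{y}) \quad \text{for all } \boldsymbol{x}, \boldsymbol{y} \in \mathcal{C},\ \lambda \in [0, 1],$

which is what the code below visualizes: for the convex examples the dashed chord between two points never lies below the curve.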

def f(x):
    return 0.5 * x**2  # Convex

def g(x):
    return np.cos(np.pi * x)  # Nonconvex

def h(x):
    return np.exp(0.5 * x)  # Convex

x, segment = np.arange(-2, 2, 0.01), np.array([-1.5, 1])
d2l.use_svg_display()
_, axes = d2l.plt.subplots(1, 3, figsize=(9, 3))

for ax, func in zip(axes, [f, g, h]):
    ax.plot(x, func(x))
    ax.plot(segment, func(segment), '--', color="purple")
    # d2l.plt.plot([x, segment], [func(x), func(segment)], axes=ax)

Jensen's inequality
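The standard statement for a convex function $f$:

$\sum_i \alpha_i f(x_i) \ge f\Bigl(\sum_i \alpha_i x_i\Bigr) \quad \text{with } \alpha_i \ge 0,\ \sum_i \alpha_i = 1, \qquad E_X[f(X)] \ge f\bigl(E_X[X]\bigr),$

i.e. the expectation of a convex function is no smaller than the function of the expectation.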

Properties
① No local minima
② Relationship with convex sets
③ Second-order condition
No local minima
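A short version of the standard argument: suppose $x^\ast$ is a local minimum of a convex $f$ but some $y$ satisfies $f(y) < f(x^\ast)$. For $\lambda \in (0, 1)$ close to 1, the point $\lambda x^\ast + (1-\lambda) y$ is arbitrarily close to $x^\ast$, yet convexity gives $f(\lambda x^\ast + (1-\lambda)y) \le \lambda f(x^\ast) + (1-\lambda) f(y) < f(x^\ast)$, contradicting local minimality. Hence every local minimum of a convex function is a global minimum.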

Relationship with convex sets
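For reference: if $f$ is convex, every sub-level set $S_b = \{x \mid f(x) \le b\}$ is a convex set. The surface plotted below, $f(x, y) = x^2 + 0.5\cos(2\pi y)$, is not convex, and its sub-level sets (shown by the contour at the bottom of the plot) are not convex either.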

x, y = np.meshgrid(np.linspace(-1, 1, 101), np.linspace(-1, 1, 101), indexing='ij')
z = x**2 + 0.5 * np.cos(2 * np.pi * y)

# Plot the 3D surface
d2l.set_figsize((6, 4))
ax = d2l.plt.figure().add_subplot(111, projection='3d')
ax.plot_wireframe(x, y, z, **{'rstride': 10, 'cstride': 10})
ax.contour(x, y, z, offset=-1)
ax.set_zlim(-1, 1.5)

# Adjust labels
for func in [d2l.plt.xticks, d2l.plt.yticks, ax.set_zticks]:
    func([-1, 0, 1])

Convex functions and second derivatives
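For reference, the standard second-order condition: a twice-differentiable function of one variable is convex if and only if $f''(x) \ge 0$ everywhere; in several variables, if and only if the Hessian $\nabla^2 f(\boldsymbol{x})$ is positive semidefinite.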

def f(x):
    return 0.5 * x**2

x = np.arange(-2, 2, 0.01)
axb, ab = np.array([-1.5, -0.5, 1]), np.array([-1.5, 1])

d2l.set_figsize((3.5, 2.5))
fig_x, = d2l.plt.plot(x, f(x))
fig_axb, = d2l.plt.plot(axb, f(axb), '-.', color="purple")
fig_ab, = d2l.plt.plot(ab, f(ab), 'g-.')

fig_x.axes.annotate('a', (-1.5, f(-1.5)), (-1.5, 1.5), arrowprops=dict(arrowstyle='->'))
fig_x.axes.annotate('b', (1, f(1)), (1, 1.5), arrowprops=dict(arrowstyle='->'))
fig_x.axes.annotate('x', (-0.5, f(-0.5)), (-1.5, f(-0.5)), arrowprops=dict(arrowstyle='->'))

III. Gradient Descent
%matplotlib inline
import numpy as np
import torch
import time
from torch import nn, optim
import math
import sys
sys.path.append('/home/kesci/input')
import d2lzh1981 as d2l

One-dimensional gradient descent
Claim: moving the variable in the direction opposite to the gradient decreases the function value.
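A sketch of the standard first-order Taylor argument:

$f(x + \epsilon) \approx f(x) + \epsilon f'(x).$

Choosing $\epsilon = -\eta f'(x)$ for a small learning rate $\eta > 0$ gives

$f(x - \eta f'(x)) \approx f(x) - \eta f'(x)^2 \le f(x),$

so a step against the gradient does not increase $f$ to first order, and strictly decreases it whenever $f'(x) \neq 0$.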

def f(x):
    return x**2  # Objective function

def gradf(x):
    return 2 * x  # Its derivative

def gd(eta):
    x = 10
    results = [x]
    for i in range(10):
        x -= eta * gradf(x)
        results.append(x)
    print('epoch 10, x:', x)
    return results

res = gd(0.2)

def show_trace(res):
    n = max(abs(min(res)), abs(max(res)))
    f_line = np.arange(-n, n, 0.01)
    d2l.set_figsize((3.5, 2.5))
    d2l.plt.plot(f_line, [f(x) for x in f_line], '-')
    d2l.plt.plot(res, [f(x) for x in res], '-o')
    d2l.plt.xlabel('x')
    d2l.plt.ylabel('f(x)')

show_trace(res)

Learning rate
show_trace(gd(0.05))

show_trace(gd(1.1))

Local minima
c = 0.15 * np.pi

def f(x):
    return x * np.cos(c * x)

def gradf(x):
    return np.cos(c * x) - c * x * np.sin(c * x)

show_trace(gd(2))

Multi-dimensional gradient descent
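In the usual notation, the gradient collects the partial derivatives,

$\nabla f(\boldsymbol{x}) = \Bigl[\frac{\partial f}{\partial x_1}, \frac{\partial f}{\partial x_2}, \dots, \frac{\partial f}{\partial x_d}\Bigr]^\top,$

and gradient descent updates

$\boldsymbol{x} \leftarrow \boldsymbol{x} - \eta \nabla f(\boldsymbol{x}).$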
def train_2d(trainer, steps=20):
    x1, x2 = -5, -2
    results = [(x1, x2)]
    for i in range(steps):
        x1, x2 = trainer(x1, x2)
        results.append((x1, x2))
    print('epoch %d, x1 %f, x2 %f' % (i + 1, x1, x2))
    return results

def show_trace_2d(f, results):
    d2l.plt.plot(*zip(*results), '-o', color='#ff7f0e')
    x1, x2 = np.meshgrid(np.arange(-5.5, 1.0, 0.1), np.arange(-3.0, 1.0, 0.1))
    d2l.plt.contour(x1, x2, f(x1, x2), colors='#1f77b4')
    d2l.plt.xlabel('x1')
    d2l.plt.ylabel('x2')

Example: $f(\boldsymbol{x}) = x_1^2 + 2x_2^2$
eta = 0.1

def f_2d(x1, x2):  # objective function
    return x1 ** 2 + 2 * x2 ** 2

def gd_2d(x1, x2):
    return (x1 - eta * 2 * x1, x2 - eta * 4 * x2)

show_trace_2d(f_2d, train_2d(gd_2d))

Adaptive methods
Newton's method
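A sketch of the standard derivation: the second-order Taylor expansion $f(\boldsymbol{x} + \boldsymbol{\epsilon}) \approx f(\boldsymbol{x}) + \boldsymbol{\epsilon}^\top \nabla f(\boldsymbol{x}) + \frac{1}{2}\boldsymbol{\epsilon}^\top H_f \boldsymbol{\epsilon}$ is minimized over $\boldsymbol{\epsilon}$ by $\boldsymbol{\epsilon} = -H_f^{-1}\nabla f(\boldsymbol{x})$, giving the update

$\boldsymbol{x} \leftarrow \boldsymbol{x} - \eta H_f^{-1} \nabla f(\boldsymbol{x})$

with $\eta = 1$ in the pure method; in one dimension this is $x \leftarrow x - f'(x)/f''(x)$, which is what the code below implements.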

c = 0.5

def f(x):
    return np.cosh(c * x)  # Objective

def gradf(x):
    return c * np.sinh(c * x)  # Derivative

def hessf(x):
    return c**2 * np.cosh(c * x)  # Hessian

# Hide learning rate for now
def newton(eta=1):
    x = 10
    results = [x]
    for i in range(10):
        x -= eta * gradf(x) / hessf(x)
        results.append(x)
    print('epoch 10, x:', x)
    return results

show_trace(newton())

c = 0.15 * np.pi

def f(x):
    return x * np.cos(c * x)

def gradf(x):
    return np.cos(c * x) - c * x * np.sin(c * x)

def hessf(x):
    return - 2 * c * np.sin(c * x) - x * c**2 * np.cos(c * x)

show_trace(newton())

show_trace(newton(0.5))

Convergence analysis

Preconditioning (gradient descent aided by the Hessian)
$\boldsymbol{x} \leftarrow \boldsymbol{x} - \eta\,\mathrm{diag}(H_f)^{-1}\nabla f(\boldsymbol{x})$
Gradient descent with line search (conjugate gradient method)
Stochastic gradient descent (SGD)
SGD parameter update
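In the usual setting the training objective is an average over $n$ samples, $f(\boldsymbol{x}) = \frac{1}{n}\sum_{i=1}^{n} f_i(\boldsymbol{x})$, so a full gradient costs $O(n)$ per step. SGD instead samples an index $i$ uniformly at random and updates

$\boldsymbol{x} \leftarrow \boldsymbol{x} - \eta \nabla f_i(\boldsymbol{x}),$

which costs $O(1)$ per step and is unbiased: $E_i[\nabla f_i(\boldsymbol{x})] = \nabla f(\boldsymbol{x})$.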

def f(x1, x2):
    return x1 ** 2 + 2 * x2 ** 2  # Objective

def gradf(x1, x2):
    return (2 * x1, 4 * x2)  # Gradient

def sgd(x1, x2):  # Simulate noisy gradient
    global lr  # Learning rate scheduler
    (g1, g2) = gradf(x1, x2)  # Compute gradient
    (g1, g2) = (g1 + np.random.normal(0.1), g2 + np.random.normal(0.1))
    eta_t = eta * lr()  # Learning rate at time t
    return (x1 - eta_t * g1, x2 - eta_t * g2)  # Update variables

eta = 0.1
lr = (lambda: 1)  # Constant learning rate
show_trace_2d(f, train_2d(sgd, steps=50))

Dynamic learning rate
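Common schedules for the learning rate $\eta(t)$ (the parameterizations below are the usual textbook forms):
piecewise constant: $\eta(t) = \eta_i$ for $t_i \le t \le t_{i+1}$
exponential: $\eta(t) = \eta_0 e^{-\lambda t}$
polynomial: $\eta(t) = \eta_0 (\beta t + 1)^{-\alpha}$, often with $\alpha = 0.5$
The code below tries the exponential and polynomial variants.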

def exponential():
    global ctr
    ctr += 1
    return math.exp(-0.1 * ctr)

ctr = 1
lr = exponential  # Set up learning rate
show_trace_2d(f, train_2d(sgd, steps=1000))

def polynomial():
    global ctr
    ctr += 1
    return (1 + 0.1 * ctr)**(-0.5)

ctr = 1
lr = polynomial  # Set up learning rate
show_trace_2d(f, train_2d(sgd, steps=50))

Mini-batch stochastic gradient descent
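In the usual notation, a random mini-batch $\mathcal{B}_t$ is used to estimate the gradient,

$\boldsymbol{g}_t = \frac{1}{|\mathcal{B}_t|}\sum_{i \in \mathcal{B}_t} \nabla f_i(\boldsymbol{x}_{t-1}), \qquad \boldsymbol{x}_t = \boldsymbol{x}_{t-1} - \eta_t \boldsymbol{g}_t;$

$|\mathcal{B}_t| = n$ recovers batch gradient descent and $|\mathcal{B}_t| = 1$ recovers SGD, which is what the calls train_sgd(1, 1500, 6), train_sgd(0.005, 1) and train_sgd(0.05, 10) below compare.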
# Load the data
def get_data_ch7():  # saved in the d2lzh_pytorch package for later use
    data = np.genfromtxt('/home/kesci/input/airfoil4755/airfoil_self_noise.dat', delimiter='\t')
    data = (data - data.mean(axis=0)) / data.std(axis=0)  # standardize
    return torch.tensor(data[:1500, :-1], dtype=torch.float32), \
           torch.tensor(data[:1500, -1], dtype=torch.float32)  # the first 1500 samples (5 features each)

features, labels = get_data_ch7()
features.shape

import pandas as pd
df = pd.read_csv('/home/kesci/input/airfoil4755/airfoil_self_noise.dat', delimiter='\t', header=None)
df.head(10)

# From-scratch implementation
def sgd(params, states, hyperparams):
    for p in params:
        p.data -= hyperparams['lr'] * p.grad.data

# This function is saved in the d2lzh_pytorch package for later use
def train_ch7(optimizer_fn, states, hyperparams, features, labels,
              batch_size=10, num_epochs=2):
    # Initialize the model
    net, loss = d2l.linreg, d2l.squared_loss

    w = torch.nn.Parameter(torch.tensor(np.random.normal(0, 0.01, size=(features.shape[1], 1)),
                                        dtype=torch.float32), requires_grad=True)
    b = torch.nn.Parameter(torch.zeros(1, dtype=torch.float32), requires_grad=True)

    def eval_loss():
        return loss(net(features, w, b), labels).mean().item()

    ls = [eval_loss()]
    data_iter = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels), batch_size, shuffle=True)

    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            l = loss(net(X, w, b), y).mean()  # use the average loss
            # Zero the gradients
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            l.backward()
            optimizer_fn([w, b], states, hyperparams)  # update the model parameters
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())  # record the training error every 100 samples
    # Print the result and plot
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    d2l.set_figsize()
    d2l.plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    d2l.plt.xlabel('epoch')
    d2l.plt.ylabel('loss')

def train_sgd(lr, batch_size, num_epochs=2):
    train_ch7(sgd, None, {'lr': lr}, features, labels, batch_size, num_epochs)

train_sgd(1, 1500, 6)
train_sgd(0.005, 1)
train_sgd(0.05, 10)

# Concise implementation
# Unlike the original book, the first argument here is an optimizer constructor rather than its name
# e.g. optimizer_fn=torch.optim.SGD, optimizer_hyperparams={"lr": 0.05}
def train_pytorch_ch7(optimizer_fn, optimizer_hyperparams, features, labels,
                      batch_size=10, num_epochs=2):
    # Initialize the model
    net = nn.Sequential(
        nn.Linear(features.shape[-1], 1)
    )
    loss = nn.MSELoss()
    optimizer = optimizer_fn(net.parameters(), **optimizer_hyperparams)

    def eval_loss():
        return loss(net(features).view(-1), labels).item() / 2

    ls = [eval_loss()]
    data_iter = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels), batch_size, shuffle=True)

    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            # Divide by 2 to stay consistent with train_ch7, whose squared_loss divides by 2
            l = loss(net(X).view(-1), y) / 2

            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
    # Print the result and plot
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    d2l.set_figsize()
    d2l.plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    d2l.plt.xlabel('epoch')
    d2l.plt.ylabel('loss')

train_pytorch_ch7(optim.SGD, {"lr": 0.05}, features, labels, 10)

