Reinforcement Learning | Implementing the PPO Algorithm on Pendulum

The code itself is adapted from other people's work, but let me share a few big pitfalls I ran into.
1. Monte Carlo V-values
The example given in the book takes one trajectory at a time and computes state values backward via v = r + gamma * v. This is essentially a pure Monte Carlo estimate of the state value, and it does not average over multiple trajectories, so the estimate has very high variance. I think this approach is quite poor.
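To make this concrete, here is a minimal sketch (my own illustration, not the book's code) of the backward Monte Carlo return for one trajectory, and of averaging the resulting estimates over several trajectories to reduce variance; the trajectory data here is purely hypothetical.

import numpy as np

def mc_returns(rewards, gamma=0.99):
    # Backward pass v = r + gamma * v along a single trajectory (pure Monte Carlo).
    v, returns = 0.0, []
    for r in reversed(rewards):
        v = r + gamma * v
        returns.append(v)
    return returns[::-1]

# Averaging the estimate of V(s_0) over several trajectories reduces its variance:
trajectories = [np.random.randn(200).tolist() for _ in range(8)]  # hypothetical reward sequences
v0_avg = np.mean([mc_returns(tr)[0] for tr in trajectories])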
2. The samples are not i.i.d.
Because of the issue in point 1, the collected samples are not independent and identically distributed: consecutive transitions from the same trajectory are strongly correlated, and feeding them into training directly can badly hurt the result. A common remedy is sketched below.
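A minimal sketch (my own illustration) of the usual fix: store transitions in a buffer and draw random minibatches, which breaks up the correlation between consecutive samples. This is essentially what DQBReplayer.sample() in the code below does.

import numpy as np

buffer = [("s%d" % t, "a%d" % t) for t in range(1000)]    # stand-in for stored transitions
idx = np.random.choice(len(buffer), size=32, replace=False)
minibatch = [buffer[i] for i in idx]                      # randomly drawn, order-independent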
3. The code was overly convoluted.
As the saying goes, prefer the simple over the elaborate. Wrapping too many steps into functions at an early stage is not good practice: it makes debugging very inconvenient, and those wrappers should all be removed.
4. The neural network very easily outputs [nan]
This may be because torch.Tensor() was used to convert the vectors: starting from double-precision NumPy arrays leaves you with float64 tensors, which also take more memory. Switching to torch.FloatTensor() brought a clear improvement. This point is extremely important; without it, training may not work at all.
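For reference, a minimal sketch of the dtype trap as I understand it: NumPy defaults to float64, and torch.tensor() / torch.from_numpy() keep that dtype, while torch.FloatTensor() forces float32, matching the default dtype of the network's weights.

import numpy as np
import torch

state = np.array([0.1, 0.2, 0.3])         # NumPy defaults to float64
print(torch.tensor(state).dtype)          # torch.float64 -- keeps the NumPy dtype
print(torch.from_numpy(state).dtype)      # torch.float64
print(torch.FloatTensor(state).dtype)     # torch.float32 -- what nn.Linear weights expect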
[Figure: training results]
The code is as follows:

""" """ import torch.nn.functional as F import torchvision.models as models import retro import hiddenlayer as hl import torch # import retro import pandas as pd import numpy as np import gym import torch.nn as nn from torch.distributions import Normal class DQBReplayer: def __init__(self,capacity): self.memory = pd.DataFrame(index=range(capacity),columns=['observation','action','reward','next_observation','done','step']) self.i=0 self.count=0 self.capacity=capacity def store(self,*args):self.memory.loc[self.i]=args self.i=(self.i+1)%self.capacity self.count=min(self.count+1,self.capacity) def sample(self,size=32): indics=np.random.choice(self.count,size=size)return (np.stack(self.memory.loc[indics,field]) for field in self.memory.columns)#为什么#是第indics行和feild列 def clear(self): self.memory.drop(self.memory.index,inplace=True) self.count=0 self.i=0 # class PolicyNetwork(nn.Module): def __init__(self): super(PolicyNetwork, self).__init__() self.relu = nn.ReLU() self.fc1 = nn.Linear(3, 64) self.fc2 = nn.Linear(64, 256) self.fc_mu = nn.Linear(256, 1) self.fc_std = nn.Linear(256, 1) self.tanh = nn.Tanh() self.softplus = nn.Softplus()def forward(self, x): x = self.relu(self.fc1(x)) x = self.relu(self.fc2(x)) mu = 2 * self.tanh(self.fc_mu(x)) std = self.softplus(self.fc_std(x)) + 1e-3 return mu, stddef select_action(self, state):with torch.no_grad(): mu, std = self.forward(state) n = Normal(mu, std) action = n.sample() # print(" ac{:.1f},mu{},std{}".format( float(action),mu,std), end=" ") return np.clip(action.item(), -2., 2.)class ValueNetwork(nn.Module): def __init__(self): super(ValueNetwork, self).__init__() self.relu = nn.ReLU() self.fc1 = nn.Linear(3, 64) self.fc2 = nn.Linear(64, 256) self.fc3 = nn.Linear(256, 1)def forward(self, x): x = self.relu(self.fc1(x)) x = self.relu(self.fc2(x)) x = self.fc3(x) return xclass PPO(nn.Module): def __init__(self): super(PPO,self).__init__() self.replayer=DQBReplayer(capacity=1000) self.gamma=0.99 self.policy = PolicyNetwork().to(device) self.old_policy = PolicyNetwork().to(device) self.value = https://www.it610.com/article/ValueNetwork().to(device) self.learn_step=0 self.canvasl = hl.Canvas() self.history = hl.History()if __name__ =="__main__": device=torch.device("cuda" if torch.cuda.is_available() else"cpu") env=gym.make("Pendulum-v0").unwrappednet=PPO().to(device) optim = torch.optim.Adam(net.policy.parameters(), lr=0.001) value_optim= torch.optim.Adam(net.value.parameters(), lr=0.001)for i in range(200000): state = env.reset() epoch_reward=0#每局游戏的累计奖励 for step in range(200): # env.render() state_tensor = torch.FloatTensor(state).to(device) action=net.policy.select_action(state_tensor) next_state,r,done,info=env.step([action])reward = (r + 8.1) / 8.1 epoch_reward+=reward net.replayer.store(state, action, reward, next_state, done,step) net.learn_step += 1 state = next_statenet.old_policy.load_state_dict(net.policy.state_dict()) for K in range(10): sample_n = net.replayer.count states, actions, rewards, next_states, dones, steps = net.replayer.sample(32) states = torch.FloatTensor(states).to(device) next_states = torch.FloatTensor(next_states).to(device) actions = torch.FloatTensor(actions).unsqueeze(1).to(device) rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device) with torch.no_grad():# 为什么 old_mu, old_std = net.old_policy(states) old_n = Normal(old_mu, old_std)value_target = rewards + net.gamma * net.value(next_states) advantage = value_target - net.value(states)mu, std = net.policy(states) n = Normal(mu, std) log_prob = n.log_prob(actions) 
old_log_prob = old_n.log_prob(actions) ratio = torch.exp(log_prob - old_log_prob) L1 = ratio * advantage L2 = torch.clamp(ratio, 0.8, 1.2) * advantage loss = torch.min(L1, L2) loss = - loss.mean() # writer.add_scalar('action loss', loss.item(), steps)optim.zero_grad() loss.backward() optim.step() #clear value_loss = F.mse_loss(value_target, net.value(states)) value_optim.zero_grad() value_loss.backward() value_optim.step() net.replayer.clear() # writer.add_scalar('value loss', value_loss.item(), steps)if i % 10 == 0 and i!=0: print('Epoch:{}, episode reward is {}'.format(i, epoch_reward)) torch.save(net.policy.state_dict(), "pendulun_para\\reward"+str(epoch_reward//10)+'ppo-policy.para') # net.history.log((i * 200), avg_reward=epoch_reward/10) # with net.canvasl: #net.canvasl.draw_plot(net.history["avg_reward"]) epoch_reward = 0
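For reference, the update loop above implements the standard PPO clipped surrogate objective from Schulman et al.:

L_CLIP(theta) = E_t[ min( r_t(theta) * A_t, clip(r_t(theta), 1 - eps, 1 + eps) * A_t ) ], where r_t(theta) = pi_theta(a_t | s_t) / pi_theta_old(a_t | s_t)

Here torch.clamp(ratio, 0.8, 1.2) corresponds to eps = 0.2, and computing the ratio as exp(log_prob - old_log_prob) is numerically safer than dividing raw densities.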

