python - Trouble getting a DQN to learn CartPole-v1 (PyTorch)

Tags: python, pytorch, reinforcement-learning, openai-gym

So my DQN was training fine and solved the environment after ~65,000 iterations. However, I went off to work on other things, and now it has completely fallen apart and never reaches that level again.

Based on the suggestions from when it previously worked, I adjusted the hyperparameters, but I still don't see the same results.

import gym
import numpy as np

import torch
from torch import nn
from torch.nn import functional as F
from torch import optim

from models import DQN
from memory import Memory
from utils import wrap_input, epsilon_greedy

def main() -> int:
    env = gym.make("CartPole-v1")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Online and offline model for learning
    model = DQN(env.observation_space, env.action_space, 24).to(device)
    target = DQN(env.observation_space, env.action_space, 24).to(device)
    target.eval()

    # Optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=.001)
    loss_fn = F.smooth_l1_loss


    memory = Memory(10_000)
    obs, info = env.reset()

    for it in range(65_000):
        # Do this for the batch norm
        model.eval()

        # Maybe explore
        if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
            state = wrap_input(obs, device).unsqueeze(0)
            action  = model(state).argmax().item()
        else:
            action = env.action_space.sample()

        # Act in environment and store the memory
        next_state, reward, done, truncated, info = env.step(action)
        if truncated or done:
            next_state = np.zeros(env.observation_space.shape)
        memory.store([obs, action, reward, int(done), next_state])
        done = done or truncated

        if done:
            obs, info = env.reset()

        # Train
        if len(memory) > 32:
            model.train()
            states, actions, rewards, dones, next_states = memory.sample(32)

            # Wrap and move all values to the device
            states = wrap_input(states, device)
            actions = wrap_input(actions, device, torch.int64, reshape=True)
            next_states = wrap_input(next_states, device)
            rewards = wrap_input(rewards, device, reshape=True)
            dones = wrap_input(dones, device, reshape=True)

            # Get current q-values
            qs = model(states)
            qs = torch.gather(qs, dim=1, index=actions)

            # Compute target q-values
            with torch.no_grad():
                next_qs, _ = target(next_states).max(dim=1)
                next_qs = next_qs.reshape(-1, 1)

            target_qs = rewards + .9 * (1 - dones) * next_qs.reshape(-1, 1)

            # Compute loss
            loss = loss_fn(qs, target_qs)
            optimizer.zero_grad()
            loss.backward()
            
            # Clip gradients
            nn.utils.clip_grad_norm_(model.parameters(), 1)

            # Backprop
            optimizer.step()

            # soft update
            with torch.no_grad():
                for target_param, local_param in zip(target.parameters(), model.parameters()):
                    target_param.data.copy_(1e-2 * local_param.data + (1 - 1e-2) * target_param.data)


        if it % 200 == 0:
            target.load_state_dict(model.state_dict())

# models.py
class FlatExtractor(nn.Module):
    '''Does nothing but pass the input on'''
    def __init__(self, obs_space):
        super(FlatExtractor, self).__init__()

        self.n_flatten = obs_space.shape[0]

    def forward(self, obs):
        return obs


class DQN(nn.Module):
    def __init__(self, obs_space, act_space, layer_size):
        super(DQN, self).__init__()

        # Feature extractor
        if len(obs_space.shape) == 1:
            self.feature_extractor = FlatExtractor(obs_space)
        elif len(obs_space.shape) == 3:
            self.feature_extractor = NatureCnn(obs_space)
        else:
            raise NotImplementedError("This type of environment is not supported")

        # Neural network
        self.net = nn.Sequential(
            nn.Linear(self.feature_extractor.n_flatten, layer_size),
            nn.BatchNorm1d(layer_size),
            nn.ReLU(),
            nn.Linear(layer_size, layer_size),
            nn.BatchNorm1d(layer_size),
            nn.ReLU(),
            nn.Linear(layer_size, act_space.n),
        )

    def forward(self, obs):
        return self.net(self.feature_extractor(obs))

# memory.py
import random
from collections import deque

class Memory(object):
    def __init__(self, maxlen):
        self.memory = deque(maxlen=maxlen)

    def store(self, experience):
        self.memory.append(experience)

    def sample(self, n_samples):
        return zip(*random.sample(self.memory, n_samples))

    def __len__(self):
        return len(self.memory)

# utils.py
def wrap_input(arr, device, dtype=torch.float, reshape=False):
    output = torch.from_numpy(np.array(arr)).type(dtype).to(device)
    if reshape:
        output = output.reshape(-1, 1)

    return output

def epsilon_greedy(start, end, n_steps, it):
    return max(start - (start - end) * (it / n_steps), end)

Is there something obvious I'm missing? I have tried training for longer, but nothing changes. The biggest problem seems to be the loss exploding, and even changing tau or switching to hard updates doesn't seem to fix it.
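
For reference, the "soft update" in my training loop is Polyak averaging with an interpolation coefficient tau (hard-coded as 1e-2 above); tau = 1.0 reduces to a hard copy of the parameters (load_state_dict, used every 200 iterations, also copies the BatchNorm buffers, which parameters() does not). Factored out as a helper, it would look roughly like this:

def soft_update(target_net, online_net, tau=1e-2):
    # Blend the online weights into the target: tau=1.0 is a full (hard) copy.
    with torch.no_grad():
        for t_param, o_param in zip(target_net.parameters(), online_net.parameters()):
            t_param.data.copy_(tau * o_param.data + (1.0 - tau) * t_param.data)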

Best Answer

I had a lot of trouble running your code, so I had to comment out a few things. I also commented out anything that adds unnecessary complexity while debugging; for example, a simple environment like CartPole doesn't need a target network. Also, pay more attention to the total return obtained rather than to the loss.

Some of the main changes I made are -

  • At the end of each iteration, next_state should become the current state -

        obs = next_state

    1. I swapped your exploration and exploitation code
            if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
                state = wrap_input(obs, device).unsqueeze(0)
                action  = model(state).argmax().item()
            else:
                action = env.action_space.sample()
    

    Your code essentially starts out by exploiting (taking the argmax) and only begins sampling random actions once the epsilon value is low enough. These two branches need to be swapped (the decay check after the corrected snippet below shows why).

    I replaced it with -

            if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
                action = env.action_space.sample()
            else:
                state = wrap_input(obs, device).unsqueeze(0)
                action  = model(state).argmax().item()
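
    To see the effect of the schedule, here is how the epsilon_greedy helper from utils.py decays (just an illustration, computed from max(start - (start - end) * it / n_steps, end)):

            for it in (0, 5_000, 10_000, 15_000, 30_000):
                print(it, round(epsilon_greedy(1.0, .01, 15_000, it), 3))
            # 0      1.0
            # 5000   0.67
            # 10000  0.34
            # 15000  0.01
            # 30000  0.01

    With the swapped branches, the agent explores on almost every step early on and mostly exploits once epsilon has decayed after ~15,000 iterations.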
    
  • I increased your batch size; a larger batch size on CartPole speeds up training significantly -

        states, actions, rewards, dones, next_states = memory.sample(128)
    
  • Also, it is better to wait until the model has gathered enough experience before it starts training -

            if len(memory) > 500:
                model.train()
                states, actions, rewards, dones, next_states = memory.sample(128)
    

    The other changes I made were to simplify debugging.

  • I did not see any use for class FlatExtractor(nn.Module), so I removed it and made the following changes -

            if len(obs_space.shape) == 1:
                self.feature_extractor = env.observation_space.shape[0]

        def forward(self, obs):
            return self.net(obs)
    
  • I removed all instances of BatchNorm (another source of unnecessary complexity while debugging)

  • Replaced the loss with MSELoss and removed the gradient clipping -

        loss_fn = nn.MSELoss()

  • Changed the learning rate to lr=.0001

  • Increased the width of the neural network -

        model = DQN(env.observation_space, env.action_space, 128).to(device)
    
  • Removed the target network and its corresponding soft update.

  • Added a running total of the reward to check whether the algorithm is learning -

        tot_rew = 0
        for it in range(65_000):

            next_state, reward, done, info = env.step(action)
            tot_rew += reward

            if done:
                print("tot_rew = ", tot_rew)
                obs = env.reset()
                tot_rew = 0
    

    These are the total rewards I ended up getting -

    tot_rew =  228.0
    tot_rew =  472.0
    tot_rew =  243.0
    tot_rew =  300.0
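
    Since the advice above is to watch the return rather than the loss, a running average of recent returns makes the trend easier to see. This is just a sketch (the names recent and log_return are mine, for illustration); gym registers CartPole-v1 with reward_threshold=475.0, so an average of ~475 over the last 100 episodes is the usual "solved" criterion:

        from collections import deque

        recent = deque(maxlen=100)  # returns of the last 100 episodes

        def log_return(tot_rew):
            """Record an episode return and report the 100-episode average."""
            recent.append(tot_rew)
            avg = sum(recent) / len(recent)
            print(f"tot_rew = {tot_rew}  avg100 = {avg:.1f}")
            return avg

    Calling log_return(tot_rew) inside the if done: block instead of the bare print shows whether that level of return is actually being held.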
    

    Here is the complete fixed code -

    import gym
    import numpy as np
    
    import torch
    from torch import nn
    from torch.nn import functional as F
    from torch import optim
    
    env = gym.make("CartPole-v1")
    def main() -> int:
        
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
        # Online and offline model for learning
    
        model = DQN(env.observation_space, env.action_space, 128).to(device)
    
        target = DQN(env.observation_space, env.action_space, 24).to(device)
    
        # target.eval()
    
        # Optimizer and loss function
        optimizer = optim.Adam(model.parameters(), lr=.0001)
        loss_fn = nn.MSELoss()
    
    
        memory = Memory(10_000)
    
        obs = env.reset()
        tot_rew = 0
        for it in range(65_000):
            # print("it = ", it)
            # Do this for the batch norm
            # model.eval()
    
            # Maybe explore
            if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
                action = env.action_space.sample()
            else:
                state = wrap_input(obs, device).unsqueeze(0)
                action = model(state).argmax().item()
                # print("epsilon_greedy(1.0, .01, 15_000, it) = ", epsilon_greedy(1.0, .01, 15_000, it))
                # print("check = ", model(state).detach().numpy())
                # print("action = ", action)
    
            # Act in environment and store the memory
            next_state, reward, done, info = env.step(action)
            tot_rew += reward
            if done:
                next_state = np.zeros(env.observation_space.shape)
            memory.store([obs, action, reward, int(done), next_state])

            # next_state becomes the current state for the next step
            obs = next_state
    
            if done:
                print("tot_rew = ", tot_rew)
                obs = env.reset()
                tot_rew = 0
    
            # Train
            if len(memory) > 500:
                model.train()
                states, actions, rewards, dones, next_states = memory.sample(128)
    
                # Wrap and move all values to the device
                states = wrap_input(states, device)
                # print("states.shape = ",states.shape)
                actions = wrap_input(actions, device, torch.int64, reshape=True)
                next_states = wrap_input(next_states, device)
                rewards = wrap_input(rewards, device, reshape=True)
                dones = wrap_input(dones, device, reshape=True)
    
                # Get current q-values
                qs = model(states)
                # print("qs.shape = ", qs.shape)
                qs = torch.gather(qs, dim=1, index=actions)
    
                # Compute target q-values
                with torch.no_grad():
                    next_qs, _ = model(next_states).max(dim=1)
                    next_qs = next_qs.reshape(-1, 1)
    
                target_qs = rewards + .9 * (1 - dones) * next_qs.reshape(-1, 1)
    
                # Compute loss
                loss = loss_fn(qs, target_qs)
                # print("loss.shape = ", loss)
                optimizer.zero_grad()
                loss.backward()
                
                # Clip gradients
                # nn.utils.clip_grad_norm_(model.parameters(), 1)
    
                # Backprop
                optimizer.step()
    
                # soft update
            #     with torch.no_grad():
            #         for target_param, local_param in zip(target.parameters(), model.parameters()):
            #             target_param.data.copy_(1e-2 * local_param.data + (1 - 1e-2) * target_param.data)
    
    
            # if it % 200 == 0:
            #     target.load_state_dict(model.state_dict())
    
    # models.py
    class FlatExtractor(nn.Module):
        '''Does nothing but pass the input on'''
        def __init__(self, obs_space):
            super(FlatExtractor, self).__init__()
    
            self.n_flatten = 1
    
        def forward(self, obs):
            return obs
    
    
    class DQN(nn.Module):
        def __init__(self, obs_space, act_space, layer_size):
            super(DQN, self).__init__()
    
            # Feature extractor
            if len(obs_space.shape) == 1:
                self.feature_extractor = env.observation_space.shape[0]
    
            elif len(obs_space.shape) == 3:
                self.feature_extractor = NatureCnn(obs_space)
            else:
                raise NotImplementedError("This type of environment is not supported")
            
    
            # Neural network
            self.net = nn.Sequential(
                nn.Linear(self.feature_extractor, layer_size),
                nn.ReLU(),
                nn.Linear(layer_size, layer_size),
                nn.ReLU(),
                nn.Linear(layer_size, act_space.n),
            )
    
        def forward(self, obs):
            return self.net(obs)
    
    # memory.py
    import random
    from collections import deque
    
    class Memory(object):
        def __init__(self, maxlen):
            self.memory = deque(maxlen=maxlen)
    
        def store(self, experience):
            self.memory.append(experience)
    
        def sample(self, n_samples):
            return zip(*random.sample(self.memory, n_samples))
    
        def __len__(self):
            return len(self.memory)
    
    # utils.py
    def wrap_input(arr, device, dtype=torch.float, reshape=False):
        output = torch.from_numpy(np.array(arr)).type(dtype).to(device)
        if reshape:
            output = output.reshape(-1, 1)
    
        return output
    
    def epsilon_greedy(start, end, n_steps, it):
        return max(start - (start - end) * (it / n_steps), end)
    
    main()
    

    On "python - Trouble getting a DQN to learn CartPole-v1 (PyTorch)", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/75179713/
