python - Trouble getting a DQN to learn CartPole-v1 (PyTorch)

Tags: python, pytorch, reinforcement-learning, openai-gym

So my DQN was training fine and solved the environment after ~65,000 iterations. However, I went off to work on other things, and now it has completely fallen apart and never reaches that level again.

Based on the suggestions from when it previously worked, I adjusted the hyperparameters, but I still don't see the same results.

import gym
import numpy as np

import torch
from torch import nn
from torch.nn import functional as F
from torch import optim

from models import DQN
from memory import Memory
from utils import wrap_input, epsilon_greedy

def main() -> int:
    env = gym.make("CartPole-v1")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Online and offline model for learning
    model = DQN(env.observation_space, env.action_space, 24).to(device)
    target = DQN(env.observation_space, env.action_space, 24).to(device)
    target.eval()

    # Optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=.001)
    loss_fn = F.smooth_l1_loss


    memory = Memory(10_000)
    obs, info = env.reset()

    for it in range(65_000):
        # Do this for the batch norm
        model.eval()

        # Maybe explore
        if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
            state = wrap_input(obs, device).unsqueeze(0)
            action  = model(state).argmax().item()
        else:
            action = env.action_space.sample()

        # Act in environment and store the memory
        next_state, reward, done, truncated, info = env.step(action)
        if truncated or done:
            next_state = np.zeros(env.observation_space.shape)
        memory.store([obs, action, reward, int(done), next_state])
        done = done or truncated

        if done:
            obs, info = env.reset()

        # Train
        if len(memory) > 32:
            model.train()
            states, actions, rewards, dones, next_states = memory.sample(32)

            # Wrap and move all values to the device
            states = wrap_input(states, device)
            actions = wrap_input(actions, device, torch.int64, reshape=True)
            next_states = wrap_input(next_states, device)
            rewards = wrap_input(rewards, device, reshape=True)
            dones = wrap_input(dones, device, reshape=True)

            # Get current q-values
            qs = model(states)
            qs = torch.gather(qs, dim=1, index=actions)

            # Compute target q-values
            with torch.no_grad():
                next_qs, _ = target(next_states).max(dim=1)
                next_qs = next_qs.reshape(-1, 1)

            target_qs = rewards + .9 * (1 - dones) * next_qs.reshape(-1, 1)

            # Compute loss
            loss = loss_fn(qs, target_qs)
            optimizer.zero_grad()
            loss.backward()
            
            # Clip gradients
            nn.utils.clip_grad_norm_(model.parameters(), 1)

            # Backprop
            optimizer.step()

            # soft update
            with torch.no_grad():
                for target_param, local_param in zip(target.parameters(), model.parameters()):
                    target_param.data.copy_(1e-2 * local_param.data + (1 - 1e-2) * target_param.data)


        if it % 200 == 0:
            target.load_state_dict(model.state_dict())

# models.py
class FlatExtractor(nn.Module):
    '''Does nothing but pass the input on'''
    def __init__(self, obs_space):
        super(FlatExtractor, self).__init__()

        self.n_flatten = obs_space.shape[0]

    def forward(self, obs):
        return obs


class DQN(nn.Module):
    def __init__(self, obs_space, act_space, layer_size):
        super(DQN, self).__init__()

        # Feature extractor
        if len(obs_space.shape) == 1:
            self.feature_extractor = FlatExtractor(obs_space)
        elif len(obs_space.shape) == 3:
            self.feature_extractor = NatureCnn(obs_space)
        else:
            raise NotImplementedError("This type of environment is not supported")

        # Neural network
        self.net = nn.Sequential(
            nn.Linear(self.feature_extractor.n_flatten, layer_size),
            nn.BatchNorm1d(layer_size),
            nn.ReLU(),
            nn.Linear(layer_size, layer_size),
            nn.BatchNorm1d(layer_size),
            nn.ReLU(),
            nn.Linear(layer_size, act_space.n),
        )

    def forward(self, obs):
        return self.net(self.feature_extractor(obs))

# memory.py
import random
from collections import deque

class Memory(object):
    def __init__(self, maxlen):
        self.memory = deque(maxlen=maxlen)

    def store(self, experience):
        self.memory.append(experience)

    def sample(self, n_samples):
        return zip(*random.sample(self.memory, n_samples))

    def __len__(self):
        return len(self.memory)

# utils.py
def wrap_input(arr, device, dtype=torch.float, reshape=False):
    output = torch.from_numpy(np.array(arr)).type(dtype).to(device)
    if reshape:
        output = output.reshape(-1, 1)

    return output

def epsilon_greedy(start, end, n_steps, it):
    return max(start - (start - end) * (it / n_steps), end)

Is there something obvious I'm missing? I have tried training for longer, but nothing changes. The biggest problem seems to be the loss exploding, and even changing tau or switching to hard updates doesn't seem to fix it.
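
For reference, the "soft update" in my training loop is Polyak averaging with an interpolation coefficient tau (hard-coded as 1e-2 above); tau = 1.0 reduces to a hard copy of the parameters (load_state_dict, used every 200 iterations, also copies the BatchNorm buffers, which parameters() does not). Factored out as a helper, it would look roughly like this:

def soft_update(target_net, online_net, tau=1e-2):
    # Blend the online weights into the target: tau=1.0 is a full (hard) copy.
    with torch.no_grad():
        for t_param, o_param in zip(target_net.parameters(), online_net.parameters()):
            t_param.data.copy_(tau * o_param.data + (1.0 - tau) * t_param.data)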

Best Answer

I had a lot of trouble running your code, so I had to comment out a few things. I also commented out anything that adds unnecessary complexity while debugging; for example, a simple environment like CartPole doesn't need a target network. Also, pay more attention to the total return obtained rather than to the loss.

Some of the main changes I made are -

  • At the end of each iteration, next_state should become the current state -

        obs = next_state

    1. I swapped your exploration and exploitation code
            if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
                state = wrap_input(obs, device).unsqueeze(0)
                action  = model(state).argmax().item()
            else:
                action = env.action_space.sample()
    

    Your code essentially starts out by exploiting (taking the argmax) and only begins sampling random actions once the epsilon value is low enough. These two branches need to be swapped (the decay check after the corrected snippet below shows why).

    I replaced it with -

            if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
                action = env.action_space.sample()
            else:
                state = wrap_input(obs, device).unsqueeze(0)
                action  = model(state).argmax().item()
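
    To see the effect of the schedule, here is how the epsilon_greedy helper from utils.py decays (just an illustration, computed from max(start - (start - end) * it / n_steps, end)):

            for it in (0, 5_000, 10_000, 15_000, 30_000):
                print(it, round(epsilon_greedy(1.0, .01, 15_000, it), 3))
            # 0      1.0
            # 5000   0.67
            # 10000  0.34
            # 15000  0.01
            # 30000  0.01

    With the swapped branches, the agent explores on almost every step early on and mostly exploits once epsilon has decayed after ~15,000 iterations.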
    
  • I increased your batch size; a larger batch size on CartPole speeds up training significantly -

        states, actions, rewards, dones, next_states = memory.sample(128)
    
  • Also, it is better to wait until the model has gathered enough experience before it starts training -

            if len(memory) > 500:
                model.train()
                states, actions, rewards, dones, next_states = memory.sample(128)
    

    The other changes I made were to simplify debugging.

  • I did not see any use for class FlatExtractor(nn.Module), so I removed it and made the following changes -

            if len(obs_space.shape) == 1:
                self.feature_extractor = env.observation_space.shape[0]

        def forward(self, obs):
            return self.net(obs)
    
  • I removed all instances of BatchNorm (another source of unnecessary complexity while debugging)

  • Replaced the loss with MSELoss and removed the gradient clipping -

        loss_fn = nn.MSELoss()

  • Changed the learning rate to lr=.0001

  • Increased the width of the neural network -

        model = DQN(env.observation_space, env.action_space, 128).to(device)
    
  • Removed the target network and its corresponding soft update.

  • Added a running total of the reward to check whether the algorithm is learning -

        tot_rew = 0
        for it in range(65_000):

            next_state, reward, done, info = env.step(action)
            tot_rew += reward

            if done:
                print("tot_rew = ", tot_rew)
                obs = env.reset()
                tot_rew = 0
    

    These are the total rewards I ended up getting -

    tot_rew =  228.0
    tot_rew =  472.0
    tot_rew =  243.0
    tot_rew =  300.0
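
    Since the advice above is to watch the return rather than the loss, a running average of recent returns makes the trend easier to see. This is just a sketch (the names recent and log_return are mine, for illustration); gym registers CartPole-v1 with reward_threshold=475.0, so an average of ~475 over the last 100 episodes is the usual "solved" criterion:

        from collections import deque

        recent = deque(maxlen=100)  # returns of the last 100 episodes

        def log_return(tot_rew):
            """Record an episode return and report the 100-episode average."""
            recent.append(tot_rew)
            avg = sum(recent) / len(recent)
            print(f"tot_rew = {tot_rew}  avg100 = {avg:.1f}")
            return avg

    Calling log_return(tot_rew) inside the if done: block instead of the bare print shows whether that level of return is actually being held.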
    

    Here is the complete fixed code -

    import gym
    import numpy as np
    
    import torch
    from torch import nn
    from torch.nn import functional as F
    from torch import optim
    
    env = gym.make("CartPole-v1")
    def main() -> int:
        
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
        # Online and offline model for learning
    
        model = DQN(env.observation_space, env.action_space, 128).to(device)
    
        target = DQN(env.observation_space, env.action_space, 24).to(device)
    
        # target.eval()
    
        # Optimizer and loss function
        optimizer = optim.Adam(model.parameters(), lr=.0001)
        loss_fn = nn.MSELoss()
    
    
        memory = Memory(10_000)
    
        obs = env.reset()
        tot_rew = 0
        for it in range(65_000):
            # print("it = ", it)
            # Do this for the batch norm
            # model.eval()
    
            # Maybe explore
            if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
                action = env.action_space.sample()
            else:
                state = wrap_input(obs, device).unsqueeze(0)
                action = model(state).argmax().item()
                # print("epsilon_greedy(1.0, .01, 15_000, it) = ", epsilon_greedy(1.0, .01, 15_000, it))
                # print("check = ", model(state).detach().numpy())
                # print("action = ", action)
    
            # Act in environment and store the memory
            next_state, reward, done, info = env.step(action)
            tot_rew += reward
            if done:
                next_state = np.zeros(env.observation_space.shape)
            memory.store([obs, action, reward, int(done), next_state])

            # next_state becomes the current state for the next step
            obs = next_state
    
            if done:
                print("tot_rew = ", tot_rew)
                obs = env.reset()
                tot_rew = 0
    
            # Train
            if len(memory) > 500:
                model.train()
                states, actions, rewards, dones, next_states = memory.sample(128)
    
                # Wrap and move all values to the device
                states = wrap_input(states, device)
                # print("states.shape = ",states.shape)
                actions = wrap_input(actions, device, torch.int64, reshape=True)
                next_states = wrap_input(next_states, device)
                rewards = wrap_input(rewards, device, reshape=True)
                dones = wrap_input(dones, device, reshape=True)
    
                # Get current q-values
                qs = model(states)
                # print("qs.shape = ", qs.shape)
                qs = torch.gather(qs, dim=1, index=actions)
    
                # Compute target q-values
                with torch.no_grad():
                    next_qs, _ = model(next_states).max(dim=1)
                    next_qs = next_qs.reshape(-1, 1)
    
                target_qs = rewards + .9 * (1 - dones) * next_qs.reshape(-1, 1)
    
                # Compute loss
                loss = loss_fn(qs, target_qs)
                # print("loss.shape = ", loss)
                optimizer.zero_grad()
                loss.backward()
                
                # Clip gradients
                # nn.utils.clip_grad_norm_(model.parameters(), 1)
    
                # Backprop
                optimizer.step()
    
                # soft update
            #     with torch.no_grad():
            #         for target_param, local_param in zip(target.parameters(), model.parameters()):
            #             target_param.data.copy_(1e-2 * local_param.data + (1 - 1e-2) * target_param.data)
    
    
            # if it % 200 == 0:
            #     target.load_state_dict(model.state_dict())
    
    # models.py
    class FlatExtractor(nn.Module):
        '''Does nothing but pass the input on'''
        def __init__(self, obs_space):
            super(FlatExtractor, self).__init__()
    
            self.n_flatten = 1
    
        def forward(self, obs):
            return obs
    
    
    class DQN(nn.Module):
        def __init__(self, obs_space, act_space, layer_size):
            super(DQN, self).__init__()
    
            # Feature extractor
            if len(obs_space.shape) == 1:
                self.feature_extractor = env.observation_space.shape[0]
    
            elif len(obs_space.shape) == 3:
                self.feature_extractor = NatureCnn(obs_space)
            else:
                raise NotImplementedError("This type of environment is not supported")
            
    
            # Neural network
            self.net = nn.Sequential(
                nn.Linear(self.feature_extractor, layer_size),
                nn.ReLU(),
                nn.Linear(layer_size, layer_size),
                nn.ReLU(),
                nn.Linear(layer_size, act_space.n),
            )
    
        def forward(self, obs):
            return self.net(obs)
    
    # memory.py
    import random
    from collections import deque
    
    class Memory(object):
        def __init__(self, maxlen):
            self.memory = deque(maxlen=maxlen)
    
        def store(self, experience):
            self.memory.append(experience)
    
        def sample(self, n_samples):
            return zip(*random.sample(self.memory, n_samples))
    
        def __len__(self):
            return len(self.memory)
    
    # utils.py
    def wrap_input(arr, device, dtype=torch.float, reshape=False):
        output = torch.from_numpy(np.array(arr)).type(dtype).to(device)
        if reshape:
            output = output.reshape(-1, 1)
    
        return output
    
    def epsilon_greedy(start, end, n_steps, it):
        return max(start - (start - end) * (it / n_steps), end)
    
    main()
    

    On "python - Trouble getting a DQN to learn CartPole-v1 (PyTorch)", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/75179713/
