python - Deep Q Network fails to solve OpenAI CartPole

Tags: python tensorflow machine-learning reinforcement-learning openai-gym

I am trying to implement a DQN to solve the CartPole-v0 task in OpenAI Gym. Unfortunately, the performance of my implementation does not seem to be improving.

At the moment, the episode reward actually decreases as training goes on, whereas the goal is to find a better policy that increases that value.

I am using experience replay and a separate target network for backing up my q values. I have tried adding/removing layers and neurons in the agent; that did not work. I have also changed the schedule by which the exploration rate is decreased; that did not work either. I am getting more and more convinced that something is wrong with my loss function, but I am not sure how to change it to improve performance.

Here is my loss function code:

with tf.variable_scope('loss'):
    one_hot_mask = self.one_hot_actions
    eval = tf.reduce_max(self.q * one_hot_mask, axis=1)
    print(eval)
    trg = tf.reduce_max(self.q_targ, axis = 1) * self.gamma
    print(trg)
    label = trg + self.rewards
    self.loss = tf.reduce_mean(tf.square(label - eval))

where one_hot_actions is a placeholder defined as:

        self.one_hot_actions = tf.placeholder(tf.float32, [None, self.env.action_space.n], 'one_hot_actions')

Here is my full code:

import tensorflow as tf
import numpy as np
import gym
import sys
import random
import math
import matplotlib.pyplot as plt

class Experience(object):
    """Experience buffer for experience replay"""
    def __init__(self, size):
        super(Experience, self).__init__()
        self.size = size
        self.memory = []
    def add(self, sample):
        self.memory.append(sample)
        if len(self.memory) > self.size:
            self.memory.pop(0)

class Agent(object):
    def __init__(self, env, ep_max, ep_len, gamma, lr, batch, epochs, s_dim, minibatch_size):
        super(Agent, self).__init__()
        self.ep_max = ep_max
        self.ep_len = ep_len
        self.gamma = gamma
        self.experience = Experience(100)
        self.lr = lr
        self.batch = batch
        self.minibatch_size = minibatch_size
        self.epochs = epochs
        self.s_dim = s_dim
        self.sess = tf.Session()
        self.env = gym.make(env).unwrapped

        self.state_0s = tf.placeholder(tf.float32, [None, self.s_dim], 'state_0s')
        self.actions = tf.placeholder(tf.int32, [None, 1], 'actions')
        self.rewards = tf.placeholder(tf.float32, [None, 1], 'rewards')
        self.states = tf.placeholder(tf.float32, [None, self.s_dim], 'states')

        self.one_hot_actions = tf.placeholder(tf.float32, [None, self.env.action_space.n], 'one_hot_actions')

        # q nets
        self.q, q_params = self.build_dqn('primary', trainable=True)
        self.q_targ, q_targ_params = self.build_dqn('target', trainable=False)

        with tf.variable_scope('update_target'):
            self.update_target_op = [targ_p.assign(p) for p, targ_p in zip(q_params, q_targ_params)]

        with tf.variable_scope('loss'):
            one_hot_mask = self.one_hot_actions
            eval = tf.reduce_max(self.q * one_hot_mask, axis=1)
            print(eval)
            trg = tf.reduce_max(self.q_targ, axis = 1) * self.gamma
            print(trg)
            label = trg + self.rewards
            self.loss = tf.reduce_mean(tf.square(label - eval))

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        tf.summary.FileWriter("log/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

    def build_dqn(self, name, trainable):
        with tf.variable_scope(name):
            if name == "primary":
                l1 = tf.layers.dense(self.state_0s, 100, tf.nn.relu, trainable=trainable)
            else:
                l1 = tf.layers.dense(self.states, 100, tf.nn.relu, trainable=trainable)
            l2 = tf.layers.dense(l1, 50, tf.nn.relu, trainable=trainable)
            q = tf.layers.dense(l2, self.env.action_space.n, trainable=trainable)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return q, params

    def choose_action(self, s, t):
        s = s[np.newaxis, :]
        if random.uniform(0,1) < self.get_explore_rate(t):
            a = self.env.action_space.sample()
        else:
            a = np.argmax(self.sess.run(self.q, {self.state_0s: s})[0])
        return a

    def get_explore_rate(self, t):
        return max(0.01, min(1, 1.0 - math.log10((t+1)/25)))

    def update(self):
        # experience is [ [s_0, a, r, s], [s_0, a, r, s], ... ]
        self.sess.run(self.update_target_op)
        indices = np.random.choice(range(len(self.experience.memory)), self.batch)
        # indices = range(len(experience))
        state_0 = [self.experience.memory[index][0] for index in indices]
        a =  [self.experience.memory[index][1] for index in indices]
        rs = [self.experience.memory[index][2] for index in indices]
        state = [self.experience.memory[index][3] for index in indices]

        [self.sess.run(self.train_op, feed_dict = {self.state_0s: state_0,
            self.one_hot_actions: a, self.rewards: np.asarray(rs).reshape([-1,1]), self.states: state}) for _ in range(self.epochs)]

    def run(self):
        all_ep_r = []
        for ep in range(self.ep_max):
            s_0 = self.env.reset()
            ep_r = 0
            for t in range(self.ep_len):
                fake_ac = [0.0, 0.0] # used to make one hot actions
                # self.env.render()
                a = self.choose_action(s_0, ep)
                s, r, done, _ = self.env.step(a)
                if done:
                    s = np.zeros(np.shape(s_0))
                fake_ac[a] = 1.0
                print(fake_ac)
                self.experience.add([s_0, fake_ac, r, s])
                s_0 = s
                ep_r += r

                if done:
                    break

            all_ep_r.append(ep_r)
            print(
                'Ep: %i' % ep,
                "|Ep_r: %i" % ep_r,
            )
            if len(self.experience.memory) > self.batch -1:
                self.update()
        return all_ep_r

agent = Agent("CartPole-v0", 200, 200, 0.99, 0.00025, 32, 10, 4, 16)
all_ep_r = agent.run()
plt.plot(range(len(all_ep_r)), all_ep_r)
plt.show()

Best Answer

Simon's comment is correct. Your loss function code is wrong because it does not take terminal states into account.

The target trg should be reward + gamma * Q if and only if the state is non-terminal.

If the state is terminal (the pole has fallen and the episode is over), the target is just the reward.
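
For illustration, here is a minimal sketch of how that could look in the question's TF1-style graph. It assumes an extra dones placeholder (fed with 1.0 for terminal transitions and 0.0 otherwise) and an extra done flag stored with each transition in the replay buffer; those names, the reduce_sum selection of the taken action's Q-value, and the keepdims reshaping are additions made for this sketch, not part of the original code or answer:

# Extra placeholder (assumed): 1.0 where the sampled transition ended the episode, 0.0 otherwise.
self.dones = tf.placeholder(tf.float32, [None, 1], 'dones')

with tf.variable_scope('loss'):
    # Q(s_0, a) of the action actually taken, selected via the one-hot mask.
    eval = tf.reduce_sum(self.q * self.one_hot_actions, axis=1, keepdims=True)
    # max_a' Q_target(s, a'), zeroed out for terminal transitions so the
    # target collapses to just the reward when done == 1.0.
    max_q_next = tf.reduce_max(self.q_targ, axis=1, keepdims=True)
    label = self.rewards + self.gamma * (1.0 - self.dones) * max_q_next
    self.loss = tf.reduce_mean(tf.square(label - eval))

# The done flag would also have to be recorded and fed, for example:
#     self.experience.add([s_0, fake_ac, r, s, float(done)])              # in run()
#     ds = [self.experience.memory[index][4] for index in indices]        # in update()
#     feed_dict = {..., self.dones: np.asarray(ds).reshape([-1, 1])}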

Regarding python - Deep Q Network fails to solve OpenAI CartPole, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/50128159/
