import tensorflow as tf
import tensorflow.contrib.layers as tf_layers
import numpy as np
from gym.spaces import Discrete
from stable_baselines.common.policies import BasePolicy, nature_cnn, register_policy
class DQNPolicy(BasePolicy):
"""
Policy object that implements a DQN policy
:param sess: (TensorFlow session) The current TensorFlow session
:param ob_space: (Gym Space) The observation space of the environment
:param ac_space: (Gym Space) The action space of the environment
:param n_env: (int) The number of environments to run
:param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The batch size (n_envs * n_steps)
:param reuse: (bool) If the policy is reusable or not
:param scale: (bool) whether or not to scale the input
    :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for the observation
        placeholder and the processed observation placeholder respectively
    :param dueling: (bool) if true, duplicate the output MLP to compute a state-value baseline for the action scores
"""
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False,
obs_phs=None, dueling=True):
# DQN policies need an override for the obs placeholder, due to the architecture of the code
super(DQNPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale,
obs_phs=obs_phs)
assert isinstance(ac_space, Discrete), "Error: the action space for DQN must be of type gym.spaces.Discrete"
self.n_actions = ac_space.n
self.value_fn = None
self.q_values = None
self.dueling = dueling
def _setup_init(self):
"""
        Set up the action probability (a softmax over the Q-values)
"""
with tf.variable_scope("output", reuse=True):
assert self.q_values is not None
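            # The action distribution is a softmax over the Q-values; it is only used
            # for stochastic action selection in step() and proba_step().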
self.policy_proba = tf.nn.softmax(self.q_values)
def step(self, obs, state=None, mask=None, deterministic=True):
"""
        Returns the actions and q_values for a single step
:param obs: (np.ndarray float or int) The current observation of the environment
:param state: (np.ndarray float) The last states (used in recurrent policies)
:param mask: (np.ndarray float) The last masks (used in recurrent policies)
:param deterministic: (bool) Whether or not to return deterministic actions.
:return: (np.ndarray int, np.ndarray float, np.ndarray float) actions, q_values, states
"""
raise NotImplementedError
def proba_step(self, obs, state=None, mask=None):
"""
Returns the action probability for a single step
:param obs: (np.ndarray float or int) The current observation of the environment
:param state: (np.ndarray float) The last states (used in recurrent policies)
:param mask: (np.ndarray float) The last masks (used in recurrent policies)
:return: (np.ndarray float) the action probability
"""
raise NotImplementedError
class FeedForwardPolicy(DQNPolicy):
"""
Policy object that implements a DQN policy, using a feed forward neural network.
:param sess: (TensorFlow session) The current TensorFlow session
:param ob_space: (Gym Space) The observation space of the environment
:param ac_space: (Gym Space) The action space of the environment
:param n_env: (int) The number of environments to run
:param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The batch size (n_envs * n_steps)
:param reuse: (bool) If the policy is reusable or not
    :param layers: ([int]) The sizes of the hidden layers of the policy network (if None, defaults to [64, 64])
:param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction
:param feature_extraction: (str) The feature extraction type ("cnn" or "mlp")
    :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for the observation
        placeholder and the processed observation placeholder respectively
:param layer_norm: (bool) enable layer normalisation
    :param dueling: (bool) if true, duplicate the output MLP to compute a state-value baseline for the action scores
:param act_fun: (tf.func) the activation function to use in the neural network.
:param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
"""
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None,
cnn_extractor=nature_cnn, feature_extraction="cnn",
obs_phs=None, layer_norm=False, dueling=True, act_fun=tf.nn.relu, **kwargs):
super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps,
n_batch, dueling=dueling, reuse=reuse,
scale=(feature_extraction == "cnn"), obs_phs=obs_phs)
self._kwargs_check(feature_extraction, kwargs)
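        # Default architecture: two fully connected hidden layers of 64 units each.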
if layers is None:
layers = [64, 64]
with tf.variable_scope("model", reuse=reuse):
with tf.variable_scope("action_value"):
if feature_extraction == "cnn":
extracted_features = cnn_extractor(self.processed_obs, **kwargs)
action_out = extracted_features
else:
extracted_features = tf.layers.flatten(self.processed_obs)
action_out = extracted_features
for layer_size in layers:
action_out = tf_layers.fully_connected(action_out, num_outputs=layer_size, activation_fn=None)
if layer_norm:
action_out = tf_layers.layer_norm(action_out, center=True, scale=True)
action_out = act_fun(action_out)
action_scores = tf_layers.fully_connected(action_out, num_outputs=self.n_actions, activation_fn=None)
if self.dueling:
with tf.variable_scope("state_value"):
state_out = extracted_features
for layer_size in layers:
state_out = tf_layers.fully_connected(state_out, num_outputs=layer_size, activation_fn=None)
if layer_norm:
state_out = tf_layers.layer_norm(state_out, center=True, scale=True)
state_out = act_fun(state_out)
state_score = tf_layers.fully_connected(state_out, num_outputs=1, activation_fn=None)
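                # Dueling aggregation (Wang et al., 2016): Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
                # Centering the advantages keeps V and A identifiable.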
action_scores_mean = tf.reduce_mean(action_scores, axis=1)
action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, axis=1)
q_out = state_score + action_scores_centered
else:
q_out = action_scores
self.q_values = q_out
self.initial_state = None
self._setup_init()
def step(self, obs, state=None, mask=None, deterministic=True):
q_values, actions_proba = self.sess.run([self.q_values, self.policy_proba], {self.obs_ph: obs})
if deterministic:
actions = np.argmax(q_values, axis=1)
else:
            # Inefficient sampling
# TODO: replace the loop
actions = np.zeros((len(obs),), dtype=np.int64)
for action_idx in range(len(obs)):
actions[action_idx] = np.random.choice(self.n_actions, p=actions_proba[action_idx])
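            # A possible vectorized replacement (a sketch, not the library's code):
            # sample each row by inverse-CDF lookup on the cumulative probabilities.
            #   cdf = np.cumsum(actions_proba, axis=1)
            #   uniform = np.random.rand(len(obs), 1)
            #   actions = (cdf > uniform).argmax(axis=1)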
return actions, q_values, None
def proba_step(self, obs, state=None, mask=None):
return self.sess.run(self.policy_proba, {self.obs_ph: obs})
class CnnPolicy(FeedForwardPolicy):
"""
    Policy object that implements a DQN policy, using a CNN (the nature CNN)
:param sess: (TensorFlow session) The current TensorFlow session
:param ob_space: (Gym Space) The observation space of the environment
:param ac_space: (Gym Space) The action space of the environment
:param n_env: (int) The number of environments to run
:param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The batch size (n_envs * n_steps)
:param reuse: (bool) If the policy is reusable or not
    :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for the observation
        placeholder and the processed observation placeholder respectively
    :param dueling: (bool) if true, duplicate the output MLP to compute a state-value baseline for the action scores
:param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
"""
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
reuse=False, obs_phs=None, dueling=True, **_kwargs):
super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
feature_extraction="cnn", obs_phs=obs_phs, dueling=dueling,
layer_norm=False, **_kwargs)
class LnCnnPolicy(FeedForwardPolicy):
"""
    Policy object that implements a DQN policy, using a CNN (the nature CNN), with layer normalisation
:param sess: (TensorFlow session) The current TensorFlow session
:param ob_space: (Gym Space) The observation space of the environment
:param ac_space: (Gym Space) The action space of the environment
:param n_env: (int) The number of environments to run
:param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The batch size (n_envs * n_steps)
:param reuse: (bool) If the policy is reusable or not
    :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for the observation
        placeholder and the processed observation placeholder respectively
    :param dueling: (bool) if true, duplicate the output MLP to compute a state-value baseline for the action scores
:param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
"""
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
reuse=False, obs_phs=None, dueling=True, **_kwargs):
super(LnCnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
feature_extraction="cnn", obs_phs=obs_phs, dueling=dueling,
layer_norm=True, **_kwargs)
class MlpPolicy(FeedForwardPolicy):
"""
    Policy object that implements a DQN policy, using an MLP (2 layers of 64)
:param sess: (TensorFlow session) The current TensorFlow session
:param ob_space: (Gym Space) The observation space of the environment
:param ac_space: (Gym Space) The action space of the environment
:param n_env: (int) The number of environments to run
:param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The batch size (n_envs * n_steps)
:param reuse: (bool) If the policy is reusable or not
    :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for the observation
        placeholder and the processed observation placeholder respectively
    :param dueling: (bool) if true, duplicate the output MLP to compute a state-value baseline for the action scores
    :param _kwargs: (dict) Extra keyword arguments (ignored by the MLP feature extraction)
"""
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
reuse=False, obs_phs=None, dueling=True, **_kwargs):
super(MlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
feature_extraction="mlp", obs_phs=obs_phs, dueling=dueling,
layer_norm=False, **_kwargs)
class LnMlpPolicy(FeedForwardPolicy):
"""
    Policy object that implements a DQN policy, using an MLP (2 layers of 64), with layer normalisation
:param sess: (TensorFlow session) The current TensorFlow session
:param ob_space: (Gym Space) The observation space of the environment
:param ac_space: (Gym Space) The action space of the environment
:param n_env: (int) The number of environments to run
:param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The batch size (n_envs * n_steps)
:param reuse: (bool) If the policy is reusable or not
    :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for the observation
        placeholder and the processed observation placeholder respectively
    :param dueling: (bool) if true, duplicate the output MLP to compute a state-value baseline for the action scores
    :param _kwargs: (dict) Extra keyword arguments (ignored by the MLP feature extraction)
"""
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
reuse=False, obs_phs=None, dueling=True, **_kwargs):
super(LnMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
feature_extraction="mlp", obs_phs=obs_phs,
                                          layer_norm=True, dueling=dueling, **_kwargs)
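# Register the policies by name so they can be passed to DQN as strings (e.g. policy="MlpPolicy").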
register_policy("CnnPolicy", CnnPolicy)
register_policy("LnCnnPolicy", LnCnnPolicy)
register_policy("MlpPolicy", MlpPolicy)
register_policy("LnMlpPolicy", LnMlpPolicy)