import tensorflow as tf
import numpy as np
from gym.spaces import Box
from stable_baselines.common.policies import BasePolicy, nature_cnn, register_policy
class DDPGPolicy(BasePolicy):
    """
    Abstract policy for DDPG-style actor-critic algorithms.

    Concrete subclasses build the actor and critic graphs; this base class
    only validates the action space and declares the shared attributes.

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param scale: (bool) whether or not to scale the input
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False):
        super(DDPGPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                         reuse=reuse, scale=scale, add_action_ph=True)
        # DDPG requires a continuous action space whose bounds are symmetric
        # around zero (the actor squashes its output with tanh).
        assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box"
        assert (np.abs(ac_space.low) == ac_space.high).all(), "Error: the action space low and high must be symmetric"
        # Filled in by make_actor / make_critic in concrete subclasses.
        self.policy = None
        self.qvalue_fn = None

    def make_actor(self, obs=None, reuse=False, scope="pi"):
        """
        Build the actor (policy) network.

        :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder)
        :param reuse: (bool) whether or not to reuse parameters
        :param scope: (str) the scope name of the actor
        :return: (TensorFlow Tensor) the output tensor
        """
        raise NotImplementedError

    def make_critic(self, obs=None, action=None, reuse=False, scope="qf"):
        """
        Build the critic (Q-value) network.

        :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder)
        :param action: (TensorFlow Tensor) The action placeholder (can be None for default placeholder)
        :param reuse: (bool) whether or not to reuse parameters
        :param scope: (str) the scope name of the critic
        :return: (TensorFlow Tensor) the output tensor
        """
        raise NotImplementedError

    def step(self, obs, state=None, mask=None):
        """
        Return the policy action for a single step.

        :param obs: ([float] or [int]) The current observation of the environment
        :param state: ([float]) The last states (used in recurrent policies)
        :param mask: ([float]) The last masks (used in recurrent policies)
        :return: ([float]) actions
        """
        raise NotImplementedError

    def proba_step(self, obs, state=None, mask=None):
        """
        Return the action probability for a single step.

        :param obs: ([float] or [int]) The current observation of the environment
        :param state: ([float]) The last states (used in recurrent policies)
        :param mask: ([float]) The last masks (used in recurrent policies)
        :return: ([float]) the action probability
        """
        raise NotImplementedError

    def value(self, obs, action, state=None, mask=None):
        """
        Return the Q-value for a single step.

        :param obs: ([float] or [int]) The current observation of the environment
        :param action: ([float] or [int]) The taken action
        :param state: ([float]) The last states (used in recurrent policies)
        :param mask: ([float]) The last masks (used in recurrent policies)
        :return: ([float]) The associated value of the action
        """
        raise NotImplementedError
class FeedForwardPolicy(DDPGPolicy):
    """
    Policy object that implements a DDPG-like actor critic, using a feed forward neural network.

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param layers: ([int]) The size of the Neural network for the policy (if None, default to [64, 64])
    :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction
    :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp")
    :param layer_norm: (bool) enable layer normalisation
    :param act_fun: (tf.func) the activation function to use in the neural network.
    :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None,
                 cnn_extractor=nature_cnn, feature_extraction="cnn",
                 layer_norm=False, act_fun=tf.nn.relu, **kwargs):
        # Observations are only rescaled when they come from image pixels.
        super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse,
                                                scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)
        self.layer_norm = layer_norm
        self.feature_extraction = feature_extraction
        self.cnn_kwargs = kwargs
        self.cnn_extractor = cnn_extractor
        self.reuse = reuse
        # 1-D Q-value tensor (critic output with the trailing unit dim dropped).
        self._qvalue = None
        if layers is None:
            layers = [64, 64]
        self.layers = layers

        assert len(layers) >= 1, "Error: must have at least one hidden layer for the policy."

        self.activ = act_fun

    def make_actor(self, obs=None, reuse=False, scope="pi"):
        """
        Build the actor network: feature extraction, hidden dense layers
        (optionally layer-normalised), then a tanh-bounded action output.

        :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder)
        :param reuse: (bool) whether or not to reuse parameters
        :param scope: (str) the scope name of the actor
        :return: (TensorFlow Tensor) the output tensor
        """
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)
            for i, layer_size in enumerate(self.layers):
                pi_h = tf.layers.dense(pi_h, layer_size, name='fc' + str(i))
                if self.layer_norm:
                    pi_h = tf.contrib.layers.layer_norm(pi_h, center=True, scale=True)
                pi_h = self.activ(pi_h)
            # tanh squashes the action into [-1, 1]; the small uniform init keeps
            # initial actions near zero. Rescaling to the real action bounds
            # presumably happens in the algorithm — confirm in caller.
            self.policy = tf.nn.tanh(tf.layers.dense(pi_h, self.ac_space.shape[0], name=scope,
                                                     kernel_initializer=tf.random_uniform_initializer(minval=-3e-3,
                                                                                                      maxval=3e-3)))
        return self.policy

    def make_critic(self, obs=None, action=None, reuse=False, scope="qf"):
        """
        Build the critic network. The action tensor is concatenated to the
        hidden features after the first dense layer (not at the input).

        :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder)
        :param action: (TensorFlow Tensor) The action placeholder (can be None for default placeholder)
        :param reuse: (bool) whether or not to reuse parameters
        :param scope: (str) the scope name of the critic
        :return: (TensorFlow Tensor) the output tensor (shape (batch, 1))
        """
        if obs is None:
            obs = self.processed_obs
        if action is None:
            action = self.action_ph

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                qf_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                qf_h = tf.layers.flatten(obs)
            for i, layer_size in enumerate(self.layers):
                qf_h = tf.layers.dense(qf_h, layer_size, name='fc' + str(i))
                if self.layer_norm:
                    qf_h = tf.contrib.layers.layer_norm(qf_h, center=True, scale=True)
                qf_h = self.activ(qf_h)
                # Inject the action after the first hidden layer only.
                if i == 0:
                    qf_h = tf.concat([qf_h, action], axis=-1)

            # the name attribute is used in pop-art normalization
            qvalue_fn = tf.layers.dense(qf_h, 1, name='qf_output',
                                        kernel_initializer=tf.random_uniform_initializer(minval=-3e-3,
                                                                                         maxval=3e-3))

            self.qvalue_fn = qvalue_fn
            # Drop the trailing unit dimension for convenient evaluation.
            self._qvalue = qvalue_fn[:, 0]
        return self.qvalue_fn

    def step(self, obs, state=None, mask=None):
        # Deterministic policy: just evaluate the actor output.
        return self.sess.run(self.policy, {self.obs_ph: obs})

    def proba_step(self, obs, state=None, mask=None):
        # The policy is deterministic, so the "probability" is the action itself.
        return self.sess.run(self.policy, {self.obs_ph: obs})

    def value(self, obs, action, state=None, mask=None):
        return self.sess.run(self._qvalue, {self.obs_ph: obs, self.action_ph: action})
class CnnPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a CNN (the nature CNN)

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                        feature_extraction="cnn", **_kwargs)
class LnCnnPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a CNN (the nature CNN), with layer normalisation

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(LnCnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                          feature_extraction="cnn", layer_norm=True, **_kwargs)
class MlpPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a MLP (2 layers of 64)

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(MlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                        feature_extraction="mlp", **_kwargs)
class LnMlpPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a MLP (2 layers of 64), with layer normalisation

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(LnMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                          feature_extraction="mlp", layer_norm=True, **_kwargs)
# Register the built-in policies so they can be looked up by name.
for _policy_name, _policy_class in [("CnnPolicy", CnnPolicy),
                                    ("LnCnnPolicy", LnCnnPolicy),
                                    ("MlpPolicy", MlpPolicy),
                                    ("LnMlpPolicy", LnMlpPolicy)]:
    register_policy(_policy_name, _policy_class)