import tensorflow as tf
import numpy as np
from gym.spaces import Box
from stable_baselines.common.policies import BasePolicy, nature_cnn, register_policy
from stable_baselines.sac.policies import mlp
class TD3Policy(BasePolicy):
    """
    Base class for TD3-like actor critic policies.

    It keeps the handles to the actor output and to the two Q-value outputs, and declares the
    graph-building and stepping methods that a concrete policy has to implement.

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment (must be a ``gym.spaces.Box``)
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param scale: (bool) whether or not to scale the input
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, scale=False):
        super(TD3Policy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                        reuse=reuse, scale=scale)
        assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box"

        # Output tensors of the networks. They stay None until a subclass
        # builds the graph in make_actor() / make_critics().
        self.policy = None
        self.qf1 = None
        self.qf2 = None

    def make_actor(self, obs=None, reuse=False, scope="pi"):
        """
        Build the actor network. Must be overridden by subclasses.

        :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder)
        :param reuse: (bool) whether or not to reuse parameters
        :param scope: (str) the scope name of the actor
        :return: (TensorFlow Tensor) the output tensor
        """
        raise NotImplementedError

    def make_critics(self, obs=None, action=None, reuse=False,
                     scope="qvalues_fn"):
        """
        Build the two Q-value approximators. Must be overridden by subclasses.

        Note that ``FeedForwardPolicy`` overrides this method with a different
        default ``scope`` ("values_fn").

        :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder)
        :param action: (TensorFlow Tensor) The action placeholder
        :param reuse: (bool) whether or not to reuse parameters
        :param scope: (str) the scope name
        :return: ([tf.Tensor]) The outputs of the two Q-value networks
        """
        raise NotImplementedError

    def step(self, obs, state=None, mask=None):
        """
        Compute the actions for a single step. Must be overridden by subclasses.

        :param obs: ([float] or [int]) The current observation of the environment
        :param state: ([float]) The last states (used in recurrent policies)
        :param mask: ([float]) The last masks (used in recurrent policies)
        :return: ([float]) actions
        """
        raise NotImplementedError

    def proba_step(self, obs, state=None, mask=None):
        """
        Same as :meth:`step`: the call is forwarded unchanged and the actions are returned.

        :param obs: ([float] or [int]) The current observation of the environment
        :param state: ([float]) The last states (used in recurrent policies)
        :param mask: ([float]) The last masks (used in recurrent policies)
        :return: ([float]) actions
        """
        actions = self.step(obs, state, mask)
        return actions
class FeedForwardPolicy(TD3Policy):
    """
    Policy object that implements a TD3-like actor critic, using a feed forward neural network.

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param layers: ([int]) The size of the Neural network for the policy (if None, default to [64, 64])
    :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction
    :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp")
    :param layer_norm: (bool) enable layer normalisation
    :param act_fun: (tf.func) the activation function to use in the neural network.
    :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, layers=None,
                 cnn_extractor=nature_cnn, feature_extraction="cnn",
                 layer_norm=False, act_fun=tf.nn.relu, **kwargs):
        # Input scaling is only requested when the CNN feature extractor is used.
        super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                                reuse=reuse, scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)
        self.layer_norm = layer_norm
        self.feature_extraction = feature_extraction
        self.cnn_kwargs = kwargs
        self.cnn_extractor = cnn_extractor
        self.reuse = reuse
        if layers is None:
            layers = [64, 64]
        self.layers = layers

        assert len(layers) >= 1, "Error: must have at least one hidden layer for the policy."

        self.activ_fn = act_fun

    def make_actor(self, obs=None, reuse=False, scope="pi"):
        """
        Build the actor network and store its output in ``self.policy``.

        :param obs: (TensorFlow Tensor) The observation placeholder (``self.processed_obs`` is used when None)
        :param reuse: (bool) whether or not to reuse parameters
        :param scope: (str) the scope name of the actor
        :return: (TensorFlow Tensor) the output tensor
        """
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)

            pi_h = mlp(pi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)

            # tanh keeps every action component inside [-1, 1]
            self.policy = policy = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=tf.tanh)

        return policy

    def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn"):
        """
        Build the two Q-value networks and store their outputs in ``self.qf1`` and ``self.qf2``.

        Both networks receive the same input: the extracted observation features
        concatenated with the action.

        :param obs: (TensorFlow Tensor) The observation placeholder (``self.processed_obs`` is used when None)
        :param action: (TensorFlow Tensor) The action placeholder (required, despite the None default)
        :param reuse: (bool) whether or not to reuse parameters
        :param scope: (str) the scope name
        :return: (TensorFlow Tensor, TensorFlow Tensor) The outputs of the two Q-value networks
        :raises ValueError: if ``action`` is None
        """
        # Fail early with a clear message instead of letting tf.concat choke on
        # None after part of the graph has already been created.
        if action is None:
            raise ValueError("Error: an action tensor is required to build the critics.")

        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                critics_h = tf.layers.flatten(obs)

            # Concatenate preprocessed state and action
            qf_h = tf.concat([critics_h, action], axis=-1)

            # Double Q values to reduce overestimation
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

            self.qf1 = qf1
            self.qf2 = qf2

        return self.qf1, self.qf2

    def step(self, obs, state=None, mask=None):
        """
        Evaluate the actor output (``self.policy``) for the given observations.

        :param obs: ([float] or [int]) The current observation of the environment
        :param state: ([float]) Unused by this policy
        :param mask: ([float]) Unused by this policy
        :return: ([float]) actions
        """
        return self.sess.run(self.policy, {self.obs_ph: obs})
class CnnPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a CNN (the nature CNN)

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs):
        # Only pins the feature extractor to "cnn"; everything else is forwarded as is.
        super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                        feature_extraction="cnn", **_kwargs)
class LnCnnPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a CNN (the nature CNN), with layer normalisation

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs):
        # Pins the feature extractor to "cnn" and switches layer normalisation on.
        super(LnCnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                          feature_extraction="cnn", layer_norm=True, **_kwargs)
class MlpPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a MLP (2 layers of 64)

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments forwarded to ``FeedForwardPolicy``
        (for instance ``layers`` or ``act_fun``)
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs):
        # Only pins the feature extractor to "mlp"; everything else is forwarded as is.
        super(MlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                        feature_extraction="mlp", **_kwargs)
class LnMlpPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a MLP (2 layers of 64), with layer normalisation

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments forwarded to ``FeedForwardPolicy``
        (for instance ``layers`` or ``act_fun``)
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs):
        # Pins the feature extractor to "mlp" and switches layer normalisation on.
        super(LnMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                          feature_extraction="mlp", layer_norm=True, **_kwargs)
# Hand every concrete TD3 policy to register_policy under its class name.
# NOTE(review): presumably this is what allows a policy to be selected by its
# string name -- confirm against stable_baselines.common.policies.register_policy.
for _policy_name, _policy_class in (
        ("CnnPolicy", CnnPolicy),
        ("LnCnnPolicy", LnCnnPolicy),
        ("MlpPolicy", MlpPolicy),
        ("LnMlpPolicy", LnMlpPolicy),
):
    register_policy(_policy_name, _policy_class)
# Do not leave the loop variables behind in the module namespace.
del _policy_name, _policy_class