Source code for stable_baselines.common.evaluation

import numpy as np

from stable_baselines.common.vec_env import VecEnv


def evaluate_policy(model, env, n_eval_episodes=10, deterministic=True, render=False,
                    callback=None, reward_threshold=None, return_episode_rewards=False):
    """
    Runs policy for `n_eval_episodes` episodes and returns the average reward.
    This is made to work only with one env.

    :param model: (BaseRLModel) The RL agent you want to evaluate.
    :param env: (gym.Env or VecEnv) The gym environment. In the case of a `VecEnv`,
        this must contain only one environment.
    :param n_eval_episodes: (int) Number of episodes to evaluate the agent
    :param deterministic: (bool) Whether to use deterministic or stochastic actions
    :param render: (bool) Whether to render the environment or not
    :param callback: (callable) Callback function to do additional checks,
        called after each step.
    :param reward_threshold: (float) Minimum expected reward per episode,
        this will raise an error if the performance is not met
    :param return_episode_rewards: (bool) If True, a list of rewards per episode
        will be returned instead of the mean.
    :return: (float, int) Mean reward per episode, total number of steps
        returns ([float], int) when `return_episode_rewards` is True
    """
    if isinstance(env, VecEnv):
        # Vectorized envs are accepted, but only with a single sub-environment
        assert env.num_envs == 1, "You must pass only one environment when using this function"

    episode_rewards, n_steps = [], 0
    for _ in range(n_eval_episodes):
        obs = env.reset()
        done, state = False, None
        episode_reward = 0.0
        while not done:
            action, state = model.predict(obs, state=state, deterministic=deterministic)
            obs, reward, done, _info = env.step(action)
            episode_reward += reward
            if callback is not None:
                callback(locals(), globals())
            n_steps += 1
            if render:
                env.render()
        episode_rewards.append(episode_reward)

    mean_reward = np.mean(episode_rewards)
    if reward_threshold is not None:
        assert mean_reward > reward_threshold, 'Mean reward below threshold: '\
                                               '{:.2f} < {:.2f}'.format(mean_reward, reward_threshold)
    if return_episode_rewards:
        return episode_rewards, n_steps
    return mean_reward, n_steps
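
A minimal usage sketch (not part of the module above): it assumes a stable-baselines 2.x install with the PPO2 algorithm and the Gym `CartPole-v1` environment, both chosen only for illustration.

import gym

from stable_baselines import PPO2
from stable_baselines.common.evaluation import evaluate_policy

# Train a small agent on a single Gym environment (illustrative choices)
env = gym.make("CartPole-v1")
model = PPO2("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=10000)

# Average reward over 10 deterministic evaluation episodes
mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)
print("Mean reward: {:.2f} ({} steps)".format(mean_reward, n_steps))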